diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..33b5f782 --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +/$(top_builddir)/ +/autom4te.cache/ +/INSTALL +/Makefile +/Makefile.in +/aclocal.m4 +/config.log +/config.status +/configure +/depcomp +/install-sh +/missing diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 00000000..aff867d7 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,3 @@ +Daniel Mapleson +Bernardo Clavijo + diff --git a/COPYING b/COPYING new file mode 100644 index 00000000..e4ccb627 --- /dev/null +++ b/COPYING @@ -0,0 +1,223 @@ +GNU GENERAL PUBLIC LICENSE + +Version 3, 29 June 2007 + +Copyright © 2007 Free Software Foundation, Inc. + +Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. + +Preamble + +The GNU General Public License is a free, copyleft license for software and other kinds of works. + +The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. + +When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. + +To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. 
Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. + +For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. + +Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. + +For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. + +Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. + +Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. 
To prevent this, the GPL assures that patents cannot be used to render the program non-free. + +The precise terms and conditions for copying, distribution and modification follow. + +TERMS AND CONDITIONS + +0. Definitions. +“This License” refers to version 3 of the GNU General Public License. + +“Copyright” also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. + +“The Program” refers to any copyrightable work licensed under this License. Each licensee is addressed as “you”. “Licensees” and “recipients” may be individuals or organizations. + +To “modify” a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a “modified version” of the earlier work or a work “based on” the earlier work. + +A “covered work” means either the unmodified Program or a work based on the Program. + +To “propagate” a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. + +To “convey” a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. + +An interactive user interface displays “Appropriate Legal Notices” to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. 
If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. + +1. Source Code. +The “source code” for a work means the preferred form of the work for making modifications to it. “Object code” means any non-source form of a work. + +A “Standard Interface” means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. + +The “System Libraries” of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A “Major Component”, in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. + +The “Corresponding Source” for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. 
For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. + +The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. + +The Corresponding Source for a work in source code form is that same work. + +2. Basic Permissions. +All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. + +You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. + +Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. + +3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
+No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. + +When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. + +4. Conveying Verbatim Copies. +You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. + +You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. + +5. Conveying Modified Source Versions. +You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: + +a) The work must carry prominent notices stating that you modified it, and giving a relevant date. +b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to “keep intact all notices”. 
+c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. +d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. +A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an “aggregate” if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. + +6. Conveying Non-Source Forms. +You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: + +a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
+b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. +c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. +d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. +e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
+A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. + +A “User Product” is either (1) a “consumer product”, which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, “normally used” refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. + +“Installation Information” for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. + +If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). + +The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. + +Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. + +7. Additional Terms. +“Additional permissions” are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. + +When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
+ +Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: + +a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or +b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or +c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or +d) Limiting the use for publicity purposes of names of licensors or authors of the material; or +e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or +f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. +All other non-permissive additional terms are considered “further restrictions” within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
+ +If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. + +Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. + +8. Termination. +You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). + +However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. + +Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. + +Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. + +9. Acceptance Not Required for Having Copies. +You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. + +10. Automatic Licensing of Downstream Recipients. +Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. + +An “entity transaction” is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. + +You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. + +11. Patents. +A “contributor” is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's “contributor version”. + +A contributor's “essential patent claims” are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, “control” includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. + +Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. + +In the following three paragraphs, a “patent license” is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To “grant” such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. + +If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
“Knowingly relying” means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. + +If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. + +A patent license is “discriminatory” if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. + +Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. + +12. No Surrender of Others' Freedom. 
+If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. + +13. Use with the GNU Affero General Public License. +Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. + +14. Revised Versions of this License. +The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License “or any later version” applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
+ +If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. + +Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. + +15. Disclaimer of Warranty. +THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + +16. Limitation of Liability. +IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +17. Interpretation of Sections 15 and 16. 
+If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. + +END OF TERMS AND CONDITIONS + +How to Apply These Terms to Your New Programs + +If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. + +To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the “copyright” line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. +Also add information on how to contact you by electronic and paper mail. + +If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details.
+The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an “about box”. + +You should also get your employer (if you work as a programmer) or school, if any, to sign a “copyright disclaimer” for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see <https://www.gnu.org/licenses/>. + +The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read <https://www.gnu.org/licenses/why-not-lgpl.html>. diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 00000000..e69de29b diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 00000000..5e136654 --- /dev/null +++ b/Makefile.am @@ -0,0 +1,19 @@ +AUTOMAKE_OPTIONS = subdir-objects +ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} +EXTRA_DIST = README COPYING AUTHORS NEWS + +AM_LDFLAGS = -pthread +AM_CPPFLAGS = -Wall -Wno-sign-compare -Wnon-virtual-dtor -I$(top_srcdir)/src/inc +AM_CXXFLAGS = -g -O3 + + +bin_PROGRAMS = $(top_builddir)/bin/kat + +__top_builddir__bin_kat_SOURCES = src/sect/sect.hpp src/sect/sect_main.cc \ + src/kat.cc + +__top_builddir__bin_kat_LDADD = lib/libjellyfish-1.1.a lib/libz.a + + +dist_noinst_SCRIPTS = autogen.sh + diff --git a/NEWS b/NEWS new file mode 100644 index 00000000..e69de29b diff --git a/README b/README new file mode 100644 index 00000000..e69de29b diff --git a/autogen.sh b/autogen.sh new file mode 100644 index 00000000..eb24e309 --- /dev/null +++ b/autogen.sh @@ -0,0 +1,7 @@ +#!/bin/sh -e + +test -n "$srcdir" || srcdir=`dirname "$0"` +test -n "$srcdir" || srcdir=.
+autoreconf --force --install --verbose "$srcdir" +test -n "$NOCONFIGURE" || "$srcdir/configure" "$@" + diff --git a/clean.sh b/clean.sh new file mode 100644 index 00000000..d94e794a --- /dev/null +++ b/clean.sh @@ -0,0 +1,6 @@ +#!/bin/sh -e + +rm -r -f autom4te.cache autoscan.* Makefile.in aclocal.m4 configure depcomp INSTALL install-sh missing config.* *.scan stamp-h1 */*.o + + + diff --git a/configure.ac b/configure.ac new file mode 100644 index 00000000..c3fcd308 --- /dev/null +++ b/configure.ac @@ -0,0 +1,6 @@ +AC_INIT([Kmer Analysis Toolkit (KAT)], [0.1], [daniel.mapleson@tgac.ac.uk], [kat], [http://www.tgac.ac.uk]) +AM_INIT_AUTOMAKE([1.11 -Wall no-define]) +AC_PROG_CXX +AC_PROG_CC +AC_PROG_INSTALL +AC_OUTPUT(Makefile) diff --git a/lib/libjellyfish-1.1.a b/lib/libjellyfish-1.1.a new file mode 100644 index 00000000..55efd95a Binary files /dev/null and b/lib/libjellyfish-1.1.a differ diff --git a/lib/libjellyfish-1.1.lai b/lib/libjellyfish-1.1.lai new file mode 100644 index 00000000..b606aee6 --- /dev/null +++ b/lib/libjellyfish-1.1.lai @@ -0,0 +1,41 @@ +# libjellyfish-1.1.la - a libtool library file +# Generated by libtool (GNU libtool) 2.4.2 Debian-2.4.2-1.1 +# +# Please DO NOT delete this file! +# It is necessary for linking the library. + +# The name that we can dlopen(3). +dlname='libjellyfish-1.1.so.1' + +# Names of this library. +library_names='libjellyfish-1.1.so.1.0.1 libjellyfish-1.1.so.1 libjellyfish-1.1.so' + +# The name of the static archive. +old_library='libjellyfish-1.1.a' + +# Linker flags that can not go in dependency_libs. +inherited_linker_flags='' + +# Libraries that this one depends upon. +dependency_libs='' + +# Names of additional weak libraries provided by this library +weak_library_names='' + +# Version information for libjellyfish-1.1. +current=1 +age=0 +revision=1 + +# Is this an already installed library? +installed=yes + +# Should we warn about portability when linking against -modules? 
+shouldnotlink=no + +# Files to dlopen/dlpreopen +dlopen='' +dlpreopen='' + +# Directory that this library needs to be installed in: +libdir='/usr/local/lib' diff --git a/lib/libjellyfish-1.1.so.1.0.1 b/lib/libjellyfish-1.1.so.1.0.1 new file mode 100644 index 00000000..ecef437a Binary files /dev/null and b/lib/libjellyfish-1.1.so.1.0.1 differ diff --git a/lib/libz.a b/lib/libz.a new file mode 100644 index 00000000..5bd12c49 Binary files /dev/null and b/lib/libz.a differ diff --git a/lib/libz.so.1.2.8 b/lib/libz.so.1.2.8 new file mode 100644 index 00000000..4df143dd Binary files /dev/null and b/lib/libz.so.1.2.8 differ diff --git a/src/.deps/.gitignore b/src/.deps/.gitignore new file mode 100644 index 00000000..77147962 --- /dev/null +++ b/src/.deps/.gitignore @@ -0,0 +1,3 @@ +/*.dirstamp +/*.Po +/*.Tpo diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 00000000..9aafd03d --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,3 @@ +/*.dirstamp +/*.o +/*~ diff --git a/src/inc/gnuplot/gnuplot_i.hpp b/src/inc/gnuplot/gnuplot_i.hpp new file mode 100644 index 00000000..8909a222 --- /dev/null +++ b/src/inc/gnuplot/gnuplot_i.hpp @@ -0,0 +1,1992 @@ +//////////////////////////////////////////////////////////////////////////////// +/// +/// \brief A C++ interface to gnuplot. +/// +/// +/// The interface uses pipes and so won't run on a system that doesn't have +/// POSIX pipe support Tested on Windows (MinGW and Visual C++) and Linux (GCC) +/// +/// Version history: +/// 0. C interface +/// by N. Devillard (27/01/03) +/// 1. C++ interface: direct translation from the C interface +/// by Rajarshi Guha (07/03/03) +/// 2. corrections for Win32 compatibility +/// by V. Chyzhdzenka (20/05/03) +/// 3. some member functions added, corrections for Win32 and Linux +/// compatibility +/// by M. Burgis (10/03/08) +/// +/// Requirements: +/// * gnuplot has to be installed (http://www.gnuplot.info/download.html) +/// * for Windows: set Path-Variable for Gnuplot path +/// (e.g. 
C:/program files/gnuplot/bin) +/// or set Gnuplot path with: +/// Gnuplot::set_GNUPlotPath(const std::string &path); +/// +//////////////////////////////////////////////////////////////////////////////// + + +#ifndef _GNUPLOT_PIPES_H_ +#define _GNUPLOT_PIPES_H_ + + +#include +#include +#include +#include +#include // for std::ostringstream +#include +#include +#include // for getenv() +#include // for std::list + + +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__TOS_WIN__) +//defined for 32 and 64-bit environments + #include // for _access(), _mktemp() + #define GP_MAX_TMP_FILES 27 // 27 temporary files it's Microsoft restriction +#elif defined(unix) || defined(__unix) || defined(__unix__) || defined(__APPLE__) +//all UNIX-like OSs (Linux, *BSD, MacOSX, Solaris, ...) + #include // for access(), mkstemp() + #define GP_MAX_TMP_FILES 64 +#else + #error unsupported or unknown operating system +#endif + +//declare classes in global namespace + + +class GnuplotException : public std::runtime_error +{ + public: + GnuplotException(const std::string &msg) : std::runtime_error(msg){} +}; + + + +class Gnuplot +{ + private: + + //---------------------------------------------------------------------------------- + // member data + ///\brief pointer to the stream that can be used to write to the pipe + FILE *gnucmd; + ///\brief validation of gnuplot session + bool valid; + ///\brief true = 2d, false = 3d + bool two_dim; + ///\brief number of plots in session + int nplots; + ///\brief functions and data are displayed in a defined styles + std::string pstyle; + ///\brief interpolate and approximate data in defined styles (e.g. 
spline) + std::string smooth; + ///\brief list of created tmpfiles + std::vector tmpfile_list; + + //---------------------------------------------------------------------------------- + // static data + ///\brief number of all tmpfiles (number of tmpfiles restricted) + static int tmpfile_num; + ///\brief name of executed GNUPlot file + static std::string m_sGNUPlotFileName; + ///\brief gnuplot path + static std::string m_sGNUPlotPath; + ///\brief standart terminal, used by showonscreen + static std::string terminal_std; + + //---------------------------------------------------------------------------------- + // member functions (auxiliary functions) + // --------------------------------------------------- + ///\brief get_program_path(); and popen(); + /// + /// \param --> void + /// + /// \return <-- void + // --------------------------------------------------- + void init(); + // --------------------------------------------------- + ///\brief creates tmpfile and returns its name + /// + /// \param tmp --> points to the tempfile + /// + /// \return <-- the name of the tempfile + // --------------------------------------------------- + std::string create_tmpfile(std::ofstream &tmp); + + //---------------------------------------------------------------------------------- + ///\brief gnuplot path found? 
+ /// + /// \param --- + /// + /// \return <-- found the gnuplot path (yes == true, no == false) + // --------------------------------------------------------------------------------- + static bool get_program_path(); + + // --------------------------------------------------------------------------------- + ///\brief checks if file is available + /// + /// \param filename --> the filename + /// (no mode parameter, unlike file_exists) + /// + /// \return file is available (yes == true, no == false) + // --------------------------------------------------------------------------------- + bool file_available(const std::string &filename); + + // --------------------------------------------------------------------------------- + ///\brief checks if file exists + /// + /// \param filename --> the filename + /// \param mode --> the mode [optional, default value = 0] + /// + /// \return file exists (yes == true, no == false) + // --------------------------------------------------------------------------------- + static bool file_exists(const std::string &filename, int mode=0); + + public: + + // ---------------------------------------------------------------------------- + /// \brief optional function: set the Gnuplot path manually + /// attention: for windows: path with slash '/' not backslash '\' + /// + /// \param path --> the gnuplot path + /// + /// \return true on success, false otherwise + // ---------------------------------------------------------------------------- + static bool set_GNUPlotPath(const std::string &path); + + + // ---------------------------------------------------------------------------- + /// optional: set standard terminal, used by showonscreen + /// defaults: Windows - windows, Linux - x11, Mac - aqua + /// + /// \param type --> the terminal type + /// + /// \return --- + // ---------------------------------------------------------------------------- + static void set_terminal_std(const std::string &type); + +
//----------------------------------------------------------------------------- + // constructors + // ---------------------------------------------------------------------------- + + + ///\brief set a style during construction + Gnuplot(const std::string &style = "points"); + + /// plot a single std::vector at one go + Gnuplot(const std::vector &x, + const std::string &title = "", + const std::string &style = "points", + const std::string &labelx = "x", + const std::string &labely = "y"); + + /// plot pairs std::vector at one go + Gnuplot(const std::vector &x, + const std::vector &y, + const std::string &title = "", + const std::string &style = "points", + const std::string &labelx = "x", + const std::string &labely = "y"); + + /// plot triples std::vector at one go + Gnuplot(const std::vector &x, + const std::vector &y, + const std::vector &z, + const std::string &title = "", + const std::string &style = "points", + const std::string &labelx = "x", + const std::string &labely = "y", + const std::string &labelz = "z"); + + /// destructor: needed to delete temporary files + ~Gnuplot(); + + + //---------------------------------------------------------------------------------- + + /// send a command to gnuplot + Gnuplot& cmd(const std::string &cmdstr); + // --------------------------------------------------------------------------------- + ///\brief Sends a command to an active gnuplot session, identical to cmd() + /// send a command to gnuplot using the << operator + /// + /// \param cmdstr --> the command string + /// + /// \return <-- a reference to the gnuplot object + // --------------------------------------------------------------------------------- + inline Gnuplot& operator<<(const std::string &cmdstr){ + cmd(cmdstr); + return(*this); + } + + + + //---------------------------------------------------------------------------------- + // show on screen or write to file + + /// sets terminal type to terminal_std + Gnuplot& showonscreen(); // window output is set 
by default (win/x11/aqua) + + /// saves a gnuplot session to a postscript file, filename without extension + Gnuplot& savetops(const std::string &filename = "gnuplot_output"); + + /// saves a gnuplot session to a PNG file, filename without extension + Gnuplot& savetopng(const std::string &filename = "gnuplot_output"); + + /// saves a gnuplot session to a PDF file, filename without extension + Gnuplot& savetopdf(const std::string &filename = "gnuplot_output"); + + //---------------------------------------------------------------------------------- + // set and unset + + /// set line style (some of these styles require additional information): + /// lines, points, linespoints, impulses, dots, steps, fsteps, histeps, + /// boxes, histograms, filledcurves + Gnuplot& set_style(const std::string &stylestr = "points"); + + /// interpolation and approximation of data, arguments: + /// csplines, bezier, acsplines (for data values > 0), sbezier, unique, frequency + /// (works only with plot_x, plot_xy, plotfile_x, plotfile_xy) + /// (if smooth is set, set_style has no effect on data plotting) + Gnuplot& set_smooth(const std::string &stylestr = "csplines"); + + // ---------------------------------------------------------------------- + /// \brief unset smooth + /// attention: smooth is not set by default + /// + /// \param --- + /// + /// \return <-- a reference to a gnuplot object + // ---------------------------------------------------------------------- + inline Gnuplot& unset_smooth(){ smooth = ""; return *this;}; + + + /// scales the size of the points used in plots + Gnuplot& set_pointsize(const double pointsize = 1.0); + + /// turns grid on + inline Gnuplot& set_grid() {cmd("set grid");return *this;}; + /// turns grid off (grid is not set by default) + inline Gnuplot& unset_grid(){cmd("unset grid");return *this;}; + + // ----------------------------------------------- + /// set the multiplot mode + /// + /// \param --- + /// + /// \return <-- reference to the gnuplot object + //
----------------------------------------------- + inline Gnuplot& set_multiplot(){cmd("set multiplot") ;return *this;}; + + // ----------------------------------------------- + /// unsets the multiplot mode + /// + /// \param --- + /// + /// \return <-- reference to the gnuplot object + // ----------------------------------------------- + inline Gnuplot& unset_multiplot(){cmd("unset multiplot");return *this;}; + + + + /// set sampling rate of functions, or for interpolating data + Gnuplot& set_samples(const int samples = 100); + /// set isoline density (grid) for plotting functions as surfaces (for 3d plots) + Gnuplot& set_isosamples(const int isolines = 10); + + // -------------------------------------------------------------------------- + /// enables hidden line removal for surface plotting (for 3d plot) + /// + /// \param --- + /// + /// \return <-- reference to the gnuplot object + // -------------------------------------------------------------------------- + Gnuplot& set_hidden3d(){cmd("set hidden3d");return *this;}; + + // --------------------------------------------------------------------------- + /// disables hidden line removal (hidden3d is not set by default) + /// + /// \param --- + /// + /// \return <-- reference to the gnuplot object + // --------------------------------------------------------------------------- + inline Gnuplot& unset_hidden3d(){cmd("unset hidden3d"); return *this;}; + + /// enables/disables contour drawing for surfaces (for 3d plot) + /// base, surface, both + Gnuplot& set_contour(const std::string &position = "base"); + // -------------------------------------------------------------------------- + /// contour is not set by default, it disables contour drawing for surfaces + /// + /// \param --- + /// + /// \return <-- reference to the gnuplot object + // ------------------------------------------------------------------ + inline Gnuplot& unset_contour(){cmd("unset contour");return *this;}; + + //
------------------------------------------------------------ + /// enables the display of surfaces (for 3d plot) + /// + /// \param --- + /// + /// \return <-- reference to the gnuplot object + // ------------------------------------------------------------------ + inline Gnuplot& set_surface(){cmd("set surface");return *this;}; + + // ---------------------------------------------------------- + /// surface is set by default, + /// it disables the display of surfaces (for 3d plot) + /// + /// \param --- + /// + /// \return <-- reference to the gnuplot object + // ------------------------------------------------------------------ + inline Gnuplot& unset_surface(){cmd("unset surface"); return *this;} + + + /// switches legend on/off + /// position: inside/outside, left/center/right, top/center/bottom, nobox/box + Gnuplot& set_legend(const std::string &position = "default"); + + // ------------------------------------------------------------------ + /// \brief Switches legend off + /// attention: legend is set by default + /// + /// \param --- + /// + /// \return <-- reference to the gnuplot object + // ------------------------------------------------------------------ + inline Gnuplot& unset_legend(){cmd("unset key"); return *this;} + + // ----------------------------------------------------------------------- + /// \brief sets and clears the title of a gnuplot session + /// + /// \param title --> the title of the plot [optional, default == ""] + /// + /// \return <-- reference to the gnuplot object + // ----------------------------------------------------------------------- + inline Gnuplot& set_title(const std::string &title = "") + { + std::string cmdstr; + cmdstr = "set title \""; + cmdstr+=title; + cmdstr+="\""; + *this<set_title();return *this;} + + + /// set y axis label (NOTE(review): the default text "x" looks swapped with set_xlabel - confirm) + Gnuplot& set_ylabel(const std::string &label = "x"); + /// set x axis label (NOTE(review): the default text "y" looks swapped with set_ylabel - confirm) + Gnuplot& set_xlabel(const std::string &label = "y"); + /// set z axis label + Gnuplot& set_zlabel(const
std::string &label = "z"); + + /// set x-axis - ranges + Gnuplot& set_xrange(const double iFrom, + const double iTo); + /// set y-axis - ranges + Gnuplot& set_yrange(const double iFrom, + const double iTo); + /// set z-axis - ranges + Gnuplot& set_zrange(const double iFrom, + const double iTo); + /// autoscale axis (set by default) of x axis + /// + /// \param --- + /// + /// \return <-- reference to the gnuplot object + // ----------------------------------------------- + inline Gnuplot& set_xautoscale(){cmd("set xrange restore");cmd("set autoscale x");return *this;}; + + // ----------------------------------------------- + /// autoscale axis (set by default) of y axis + /// + /// \param --- + /// + /// \return <-- reference to the gnuplot object + // ----------------------------------------------- + inline Gnuplot& set_yautoscale(){cmd("set yrange restore");cmd("set autoscale y");return *this;}; + + // ----------------------------------------------- + /// autoscale axis (set by default) of z axis + /// + /// \param --- + /// + /// \return <-- reference to the gnuplot object + // ----------------------------------------------- + inline Gnuplot& set_zautoscale(){cmd("set zrange restore");cmd("set autoscale z");return *this;}; + + + /// turns on/off log scaling for the specified x axis (logscale is not set by default) + Gnuplot& set_xlogscale(const double base = 10); + /// turns on/off log scaling for the specified y axis (logscale is not set by default) + Gnuplot& set_ylogscale(const double base = 10); + /// turns on/off log scaling for the specified z axis (logscale is not set by default) + Gnuplot& set_zlogscale(const double base = 10); + + // ----------------------------------------------- + /// turns off log scaling for the x axis + /// + /// \param --- + /// + /// \return <-- reference to the gnuplot object + // ----------------------------------------------- + inline Gnuplot& unset_xlogscale(){cmd("unset logscale x"); return *this;}; + + //
----------------------------------------------- + /// turns off log scaling for the y axis + /// + /// \param --- + /// + /// \return <-- reference to the gnuplot object + // ----------------------------------------------- + inline Gnuplot& unset_ylogscale(){cmd("unset logscale y"); return *this;}; + + // ----------------------------------------------- + /// turns off log scaling for the z axis + /// + /// \param --- + /// + /// \return <-- reference to the gnuplot object + // ----------------------------------------------- + inline Gnuplot& unset_zlogscale(){cmd("unset logscale z"); return *this;}; + + + /// set palette range (autoscale by default) + Gnuplot& set_cbrange(const double iFrom, const double iTo); + + + //---------------------------------------------------------------------------------- + // plot + + /// plot a single std::vector: x + /// from file + Gnuplot& plotfile_x(const std::string &filename, + const unsigned int column = 1, + const std::string &title = ""); + /// from std::vector + template + Gnuplot& plot_x(const X& x, const std::string &title = ""); + + + /// plot x,y pairs: x y + /// from file + Gnuplot& plotfile_xy(const std::string &filename, + const unsigned int column_x = 1, + const unsigned int column_y = 2, + const std::string &title = ""); + /// from data + template + Gnuplot& plot_xy(const X& x, const Y& y, const std::string &title = ""); + + + /// plot x,y pairs with dy errorbars: x y dy + /// from file + Gnuplot& plotfile_xy_err(const std::string &filename, + const unsigned int column_x = 1, + const unsigned int column_y = 2, + const unsigned int column_dy = 3, + const std::string &title = ""); + /// from data + template + Gnuplot& plot_xy_err(const X &x, const Y &y, const E &dy, + const std::string &title = ""); + + + /// plot x,y,z triples: x y z + /// from file + Gnuplot& plotfile_xyz(const std::string &filename, + const unsigned int column_x = 1, + const unsigned int column_y = 2, + const unsigned int column_z = 3, + const 
std::string &title = ""); + /// from data + template + Gnuplot& plot_xyz(const X &x, + const Y &y, + const Z &z, + const std::string &title = ""); + + + + /// plot an equation of the form: y = ax + b, you supply a and b + Gnuplot& plot_slope(const double a, + const double b, + const std::string &title = ""); + + + /// plot an equation supplied as a std::string y=f(x), write only the function f(x) not y= + /// the independent variable has to be x + /// binary operators: ** exponentiation, * multiply, / divide, + add, - subtract, % modulo + /// unary operators: - minus, ! factorial + /// elementary functions: rand(x), abs(x), sgn(x), ceil(x), floor(x), int(x), imag(x), real(x), arg(x), + /// sqrt(x), exp(x), log(x), log10(x), sin(x), cos(x), tan(x), asin(x), acos(x), atan(x), atan2(y,x), + /// sinh(x), cosh(x), tanh(x), asinh(x), acosh(x), atanh(x) + /// special functions: erf(x), erfc(x), inverf(x), gamma(x), igamma(a,x), lgamma(x), ibeta(p,q,x), + /// besj0(x), besj1(x), besy0(x), besy1(x), lambertw(x) + /// statistical functions: norm(x), invnorm(x) + Gnuplot& plot_equation(const std::string &equation, + const std::string &title = ""); + + /// plot an equation supplied as a std::string z=f(x,y), write only the function f(x,y) not z= + /// the independent variables have to be x and y + Gnuplot& plot_equation3d(const std::string &equation, + const std::string &title = ""); + + + /// plot image + Gnuplot& plot_image(const unsigned char *ucPicBuf, + const unsigned int iWidth, + const unsigned int iHeight, + const std::string &title = ""); + + + //---------------------------------------------------------------------------------- + ///\brief replot repeats the last plot or splot command.
+ /// this can be useful for viewing a plot with different set options, + /// or when generating the same plot for several devices (showonscreen, savetops) + /// nothing is sent to gnuplot while nplots == 0 + /// \param --- + /// + /// \return <-- reference to the gnuplot object + //---------------------------------------------------------------------------------- + inline Gnuplot& replot(void){if (nplots > 0) cmd("replot");return *this;}; + + /// resets a gnuplot session (next plot will erase previous ones) + Gnuplot& reset_plot(); + + /// resets a gnuplot session and sets all variables to default + Gnuplot& reset_all(); + + /// deletes temporary files + void remove_tmpfiles(); + + // ------------------------------------------------------------------- + /// \brief Is the gnuplot session valid? + /// + /// + /// \param --- + /// + /// \return true if valid, false if not + // ------------------------------------------------------------------- + inline bool is_valid(){return(valid);}; + +}; + +//------------------------------------------------------------------------------ +// +// initialize static data (NOTE(review): non-inline definitions in a header; including it from a second translation unit would duplicate them - confirm) +// +int Gnuplot::tmpfile_num = 0; + +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__TOS_WIN__) +std::string Gnuplot::m_sGNUPlotFileName = "pgnuplot.exe"; +std::string Gnuplot::m_sGNUPlotPath = "C:/program files/gnuplot/bin/"; +#elif defined(unix) || defined(__unix) || defined(__unix__) || defined(__APPLE__) +std::string Gnuplot::m_sGNUPlotFileName = "gnuplot"; +std::string Gnuplot::m_sGNUPlotPath = "/usr/local/bin/"; +#endif + +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__TOS_WIN__) +std::string Gnuplot::terminal_std = "windows"; +#elif ( defined(unix) || defined(__unix) || defined(__unix__) ) && !defined(__APPLE__) +std::string Gnuplot::terminal_std = "x11"; +#elif defined(__APPLE__) +std::string Gnuplot::terminal_std = "aqua"; +#endif + +//------------------------------------------------------------------------------ +// +// constructor: set a style during construction +// +inline
Gnuplot::Gnuplot(const std::string &style) + :gnucmd(NULL) ,valid(false) ,two_dim(false) ,nplots(0) + +{ + init(); + set_style(style); +} + +//------------------------------------------------------------------------------ +// +// constructor: open a new session, plot a signal (x) +// +inline Gnuplot::Gnuplot(const std::vector &x, + const std::string &title, + const std::string &style, + const std::string &labelx, + const std::string &labely) + :gnucmd(NULL) ,valid(false) ,two_dim(false) ,nplots(0) +{ + init(); + + set_style(style); + set_xlabel(labelx); + set_ylabel(labely); + + plot_x(x,title); +} + + +//------------------------------------------------------------------------------ +// +// constructor: open a new session, plot a signal (x,y) +// +inline Gnuplot::Gnuplot(const std::vector &x, + const std::vector &y, + const std::string &title, + const std::string &style, + const std::string &labelx, + const std::string &labely) + :gnucmd(NULL) ,valid(false) ,two_dim(false) ,nplots(0) +{ + init(); + + set_style(style); + set_xlabel(labelx); + set_ylabel(labely); + + plot_xy(x,y,title); +} + + +//------------------------------------------------------------------------------ +// +// constructor: open a new session, plot a signal (x,y,z) +// +inline Gnuplot::Gnuplot(const std::vector &x, + const std::vector &y, + const std::vector &z, + const std::string &title, + const std::string &style, + const std::string &labelx, + const std::string &labely, + const std::string &labelz) + :gnucmd(NULL) ,valid(false) ,two_dim(false) ,nplots(0) +{ + init(); + + set_style(style); + set_xlabel(labelx); + set_ylabel(labely); + set_zlabel(labelz); + + plot_xyz(x,y,z,title); +} + + +//------------------------------------------------------------------------------ +// +/// Plots a 2d graph from a list of doubles: x +// +template +Gnuplot& Gnuplot::plot_x(const X& x, const std::string &title) +{ + if (x.size() == 0) + { + throw GnuplotException("std::vector too small"); + return *this; + } 
+ + std::ofstream tmp; + std::string name = create_tmpfile(tmp); + if (name == "") + return *this; + + // + // write the data to file + // + for (unsigned int i = 0; i < x.size(); i++) + tmp << x[i] << std::endl; + + tmp.flush(); + tmp.close(); + + + plotfile_x(name, 1, title); + + return *this; +} + + +//------------------------------------------------------------------------------ +// +/// Plots a 2d graph from a list of doubles: x y +// +template +Gnuplot& Gnuplot::plot_xy(const X& x, const Y& y, const std::string &title) +{ + if (x.size() == 0 || y.size() == 0) + { + throw GnuplotException("std::vectors too small"); + return *this; + } + + if (x.size() != y.size()) + { + throw GnuplotException("Length of the std::vectors differs"); + return *this; + } + + + std::ofstream tmp; + std::string name = create_tmpfile(tmp); + if (name == "") + return *this; + + // + // write the data to file + // + for (unsigned int i = 0; i < x.size(); i++) + tmp << x[i] << " " << y[i] << std::endl; + + tmp.flush(); + tmp.close(); + + + plotfile_xy(name, 1, 2, title); + + return *this; +} + +///----------------------------------------------------------------------------- +/// +/// plot x,y pairs with dy errorbars +/// +template +Gnuplot& Gnuplot::plot_xy_err(const X &x, + const Y &y, + const E &dy, + const std::string &title) +{ + if (x.size() == 0 || y.size() == 0 || dy.size() == 0) + { + throw GnuplotException("std::vectors too small"); + return *this; + } + + if (x.size() != y.size() || y.size() != dy.size()) + { + throw GnuplotException("Length of the std::vectors differs"); + return *this; + } + + + std::ofstream tmp; + std::string name = create_tmpfile(tmp); + if (name == "") + return *this; + + // + // write the data to file + // + for (unsigned int i = 0; i < x.size(); i++) + tmp << x[i] << " " << y[i] << " " << dy[i] << std::endl; + + tmp.flush(); + tmp.close(); + + + // Do the actual plot + plotfile_xy_err(name, 1, 2, 3, title); + + return *this; +} + + 
+//------------------------------------------------------------------------------ +// +// Plots a 3d graph from a list of doubles: x y z +// +template +Gnuplot& Gnuplot::plot_xyz(const X &x, + const Y &y, + const Z &z, + const std::string &title) +{ + if (x.size() == 0 || y.size() == 0 || z.size() == 0) + { + throw GnuplotException("std::vectors too small"); + return *this; + } + + if (x.size() != y.size() || x.size() != z.size()) + { + throw GnuplotException("Length of the std::vectors differs"); + return *this; + } + + + std::ofstream tmp; + std::string name = create_tmpfile(tmp); + if (name == "") + return *this; + + // + // write the data to file + // + for (unsigned int i = 0; i < x.size(); i++) + tmp << x[i] << " " << y[i] << " " << z[i] < +void stringtok (Container &container, + std::string const &in, + const char * const delimiters = " \t\n") +{ + const std::string::size_type len = in.length(); + std::string::size_type i = 0; + + while ( i < len ) + { + // eat leading whitespace + i = in.find_first_not_of (delimiters, i); + + if (i == std::string::npos) + return; // nothing left but white space + + // find the end of the token + std::string::size_type j = in.find_first_of (delimiters, i); + + // push token + if (j == std::string::npos) + { + container.push_back (in.substr(i)); + return; + } + else + container.push_back (in.substr(i, j-i)); + + // set up for next loop + i = j + 1; + } + + return; +} + + +//------------------------------------------------------------------------------ +// +// Destructor: needed to delete temporary files +// +Gnuplot::~Gnuplot() +{ +// remove_tmpfiles(); + + // A stream opened by popen() should be closed by pclose() +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__TOS_WIN__) + if (_pclose(gnucmd) == -1) +#elif defined(unix) || defined(__unix) || defined(__unix__) || defined(__APPLE__) + if (pclose(gnucmd) == -1) +#endif + throw GnuplotException("Problem closing communication to gnuplot"); +} + + 
+//------------------------------------------------------------------------------ +// +// Resets a gnuplot session (next plot will erase previous ones) +// +Gnuplot& Gnuplot::reset_plot() +{ +// remove_tmpfiles(); + + nplots = 0; + + return *this; +} + + +//------------------------------------------------------------------------------ +// +// resets a gnuplot session and sets all varibles to default +// +Gnuplot& Gnuplot::reset_all() +{ +// remove_tmpfiles(); + + nplots = 0; + cmd("reset"); + cmd("clear"); + pstyle = "points"; + smooth = ""; + showonscreen(); + + return *this; +} + + +//------------------------------------------------------------------------------ +// +// Change the plotting style of a gnuplot session +// +Gnuplot& Gnuplot::set_style(const std::string &stylestr) +{ + if (stylestr.find("lines") == std::string::npos && + stylestr.find("points") == std::string::npos && + stylestr.find("linespoints") == std::string::npos && + stylestr.find("impulses") == std::string::npos && + stylestr.find("dots") == std::string::npos && + stylestr.find("steps") == std::string::npos && + stylestr.find("fsteps") == std::string::npos && + stylestr.find("histeps") == std::string::npos && + stylestr.find("boxes") == std::string::npos && // 1-4 columns of data are required + stylestr.find("filledcurves") == std::string::npos && + stylestr.find("histograms") == std::string::npos ) //only for one data column +// stylestr.find("labels") == std::string::npos && // 3 columns of data are required +// stylestr.find("xerrorbars") == std::string::npos && // 3-4 columns of data are required +// stylestr.find("xerrorlines") == std::string::npos && // 3-4 columns of data are required +// stylestr.find("errorbars") == std::string::npos && // 3-4 columns of data are required +// stylestr.find("errorlines") == std::string::npos && // 3-4 columns of data are required +// stylestr.find("yerrorbars") == std::string::npos && // 3-4 columns of data are required +// 
stylestr.find("yerrorlines") == std::string::npos && // 3-4 columns of data are required +// stylestr.find("boxerrorbars") == std::string::npos && // 3-5 columns of data are required +// stylestr.find("xyerrorbars") == std::string::npos && // 4,6,7 columns of data are required +// stylestr.find("xyerrorlines") == std::string::npos && // 4,6,7 columns of data are required +// stylestr.find("boxxyerrorbars") == std::string::npos && // 4,6,7 columns of data are required +// stylestr.find("financebars") == std::string::npos && // 5 columns of data are required +// stylestr.find("candlesticks") == std::string::npos && // 5 columns of data are required +// stylestr.find("vectors") == std::string::npos && +// stylestr.find("image") == std::string::npos && +// stylestr.find("rgbimage") == std::string::npos && +// stylestr.find("pm3d") == std::string::npos ) + { + pstyle = std::string("points"); + } + else + { + pstyle = stylestr; + } + + return *this; +} + + +//------------------------------------------------------------------------------ +// +// smooth: interpolation and approximation of data +// +Gnuplot& Gnuplot::set_smooth(const std::string &stylestr) +{ + if (stylestr.find("unique") == std::string::npos && + stylestr.find("frequency") == std::string::npos && + stylestr.find("csplines") == std::string::npos && + stylestr.find("acsplines") == std::string::npos && + stylestr.find("bezier") == std::string::npos && + stylestr.find("sbezier") == std::string::npos ) + { + smooth = ""; + } + else + { + smooth = stylestr; + } + + return *this; +} + + +//------------------------------------------------------------------------------ +// +// sets terminal type to windows / x11 +// +Gnuplot& Gnuplot::showonscreen() +{ + cmd("set output"); + cmd("set terminal " + Gnuplot::terminal_std); + + return *this; +} + +//------------------------------------------------------------------------------ +// +// saves a gnuplot session to a postscript file +// +Gnuplot& Gnuplot::savetops(const 
std::string &filename) +{ + cmd("set terminal postscript color"); + + std::ostringstream cmdstr; + cmdstr << "set output \"" << filename << ".ps\""; + cmd(cmdstr.str()); + + return *this; +} + +//------------------------------------------------------------------------------ +// +// saves a gnuplot session to a PNG file +// +Gnuplot& Gnuplot::savetopng(const std::string &filename) +{ + cmd("set terminal png"); + + std::ostringstream cmdstr; + cmdstr << "set output \"" << filename << ".png\""; + cmd(cmdstr.str()); + + return *this; +} + +//------------------------------------------------------------------------------ +// +// saves a gnuplot session to a PDF file +// +Gnuplot& Gnuplot::savetopdf(const std::string &filename) +{ + cmd("set terminal pdf color"); + + std::ostringstream cmdstr; + cmdstr << "set output \"" << filename << ".pdf\""; + cmd(cmdstr.str()); + + return *this; +} + +//------------------------------------------------------------------------------ +// +// Switches legend on +// +Gnuplot& Gnuplot::set_legend(const std::string &position) +{ + std::ostringstream cmdstr; + cmdstr << "set key " << position; + + cmd(cmdstr.str()); + + return *this; +} + +//------------------------------------------------------------------------------ +// +// turns on log scaling for the x axis +// +Gnuplot& Gnuplot::set_xlogscale(const double base) +{ + std::ostringstream cmdstr; + + cmdstr << "set logscale x " << base; + cmd(cmdstr.str()); + + return *this; +} + +//------------------------------------------------------------------------------ +// +// turns on log scaling for the y axis +// +Gnuplot& Gnuplot::set_ylogscale(const double base) +{ + std::ostringstream cmdstr; + + cmdstr << "set logscale y " << base; + cmd(cmdstr.str()); + + return *this; +} + +//------------------------------------------------------------------------------ +// +// turns on log scaling for the z axis +// +Gnuplot& Gnuplot::set_zlogscale(const double base) +{ + std::ostringstream cmdstr; + + 
cmdstr << "set logscale z " << base; + cmd(cmdstr.str()); + + return *this; +} + +//------------------------------------------------------------------------------ +// +// scales the size of the points used in plots +// +Gnuplot& Gnuplot::set_pointsize(const double pointsize) +{ + std::ostringstream cmdstr; + cmdstr << "set pointsize " << pointsize; + cmd(cmdstr.str()); + + return *this; +} + +//------------------------------------------------------------------------------ +// +// set isoline density (grid) for plotting functions as surfaces +// +Gnuplot& Gnuplot::set_samples(const int samples) +{ + std::ostringstream cmdstr; + cmdstr << "set samples " << samples; + cmd(cmdstr.str()); + + return *this; +} + + +//------------------------------------------------------------------------------ +// +// set isoline density (grid) for plotting functions as surfaces +// +Gnuplot& Gnuplot::set_isosamples(const int isolines) +{ + std::ostringstream cmdstr; + cmdstr << "set isosamples " << isolines; + cmd(cmdstr.str()); + + return *this; +} + + +//------------------------------------------------------------------------------ +// +// enables contour drawing for surfaces set contour {base | surface | both} +// + +Gnuplot& Gnuplot::set_contour(const std::string &position) +{ + if (position.find("base") == std::string::npos && + position.find("surface") == std::string::npos && + position.find("both") == std::string::npos ) + { + cmd("set contour base"); + } + else + { + cmd("set contour " + position); + } + + return *this; +} + +//------------------------------------------------------------------------------ +// +// set labels +// +// set the xlabel +Gnuplot& Gnuplot::set_xlabel(const std::string &label) +{ + std::ostringstream cmdstr; + + cmdstr << "set xlabel \"" << label << "\""; + cmd(cmdstr.str()); + + return *this; +} + +//------------------------------------------------------------------------------ +// set the ylabel +// +Gnuplot& Gnuplot::set_ylabel(const std::string 
&label) +{ + std::ostringstream cmdstr; + + cmdstr << "set ylabel \"" << label << "\""; + cmd(cmdstr.str()); + + return *this; +} + +//------------------------------------------------------------------------------ +// set the zlabel +// +Gnuplot& Gnuplot::set_zlabel(const std::string &label) +{ + std::ostringstream cmdstr; + + cmdstr << "set zlabel \"" << label << "\""; + cmd(cmdstr.str()); + + return *this; +} + +//------------------------------------------------------------------------------ +// +// set range +// +// set the xrange +Gnuplot& Gnuplot::set_xrange(const double iFrom, + const double iTo) +{ + std::ostringstream cmdstr; + + cmdstr << "set xrange[" << iFrom << ":" << iTo << "]"; + cmd(cmdstr.str()); + + return *this; +} + +//------------------------------------------------------------------------------ +// set the yrange +// +Gnuplot& Gnuplot::set_yrange(const double iFrom, + const double iTo) +{ + std::ostringstream cmdstr; + + cmdstr << "set yrange[" << iFrom << ":" << iTo << "]"; + cmd(cmdstr.str()); + + return *this; +} + +//------------------------------------------------------------------------------ +// set the zrange +// +Gnuplot& Gnuplot::set_zrange(const double iFrom, + const double iTo) +{ + std::ostringstream cmdstr; + + cmdstr << "set zrange[" << iFrom << ":" << iTo << "]"; + cmd(cmdstr.str()); + + return *this; +} + +//------------------------------------------------------------------------------ +// +// set the palette range +// +Gnuplot& Gnuplot::set_cbrange(const double iFrom, + const double iTo) +{ + std::ostringstream cmdstr; + + cmdstr << "set cbrange[" << iFrom << ":" << iTo << "]"; + cmd(cmdstr.str()); + + return *this; +} + +//------------------------------------------------------------------------------ +// +// Plots a linear equation y=ax+b (where you supply the +// slope a and intercept b) +// +Gnuplot& Gnuplot::plot_slope(const double a, + const double b, + const std::string &title) +{ + std::ostringstream cmdstr; + // + // 
command to be sent to gnuplot + // + if (nplots > 0 && two_dim == true) + cmdstr << "replot "; + else + cmdstr << "plot "; + + cmdstr << a << " * x + " << b << " title \""; + + if (title == "") + cmdstr << "f(x) = " << a << " * x + " << b; + else + cmdstr << title; + + cmdstr << "\" with " << pstyle; + + // + // Do the actual plot + // + cmd(cmdstr.str()); + + return *this; +} + +//------------------------------------------------------------------------------ +// +// Plot an equation supplied as a std::string y=f(x) (only f(x) expected) +// +Gnuplot& Gnuplot::plot_equation(const std::string &equation, + const std::string &title) +{ + std::ostringstream cmdstr; + // + // command to be sent to gnuplot + // + if (nplots > 0 && two_dim == true) + cmdstr << "replot "; + else + cmdstr << "plot "; + + cmdstr << equation << " title \""; + + if (title == "") + cmdstr << "f(x) = " << equation; + else + cmdstr << title; + + cmdstr << "\" with " << pstyle; + + // + // Do the actual plot + // + cmd(cmdstr.str()); + + return *this; +} + +//------------------------------------------------------------------------------ +// +// plot an equation supplied as a std::string y=(x) +// +Gnuplot& Gnuplot::plot_equation3d(const std::string &equation, + const std::string &title) +{ + std::ostringstream cmdstr; + // + // command to be sent to gnuplot + // + if (nplots > 0 && two_dim == false) + cmdstr << "replot "; + else + cmdstr << "splot "; + + cmdstr << equation << " title \""; + + if (title == "") + cmdstr << "f(x,y) = " << equation; + else + cmdstr << title; + + cmdstr << "\" with " << pstyle; + + // + // Do the actual plot + // + cmd(cmdstr.str()); + + return *this; +} + + +//------------------------------------------------------------------------------ +// +// Plots a 2d graph from a list of doubles (x) saved in a file +// +Gnuplot& Gnuplot::plotfile_x(const std::string &filename, + const unsigned int column, + const std::string &title) +{ + // + // check if file exists + // + 
file_available(filename); + + + std::ostringstream cmdstr; + // + // command to be sent to gnuplot + // + if (nplots > 0 && two_dim == true) + cmdstr << "replot "; + else + cmdstr << "plot "; + + cmdstr << "\"" << filename << "\" using " << column; + + if (title == "") + cmdstr << " notitle "; + else + cmdstr << " title \"" << title << "\" "; + + if(smooth == "") + cmdstr << "with " << pstyle; + else + cmdstr << "smooth " << smooth; + + // + // Do the actual plot + // + cmd(cmdstr.str()); //nplots++; two_dim = true; already in cmd(); + + return *this; +} + + + +//------------------------------------------------------------------------------ +// +// Plots a 2d graph from a list of doubles (x y) saved in a file +// +Gnuplot& Gnuplot::plotfile_xy(const std::string &filename, + const unsigned int column_x, + const unsigned int column_y, + const std::string &title) +{ + // + // check if file exists + // + file_available(filename); + + + std::ostringstream cmdstr; + // + // command to be sent to gnuplot + // + if (nplots > 0 && two_dim == true) + cmdstr << "replot "; + else + cmdstr << "plot "; + + cmdstr << "\"" << filename << "\" using " << column_x << ":" << column_y; + + if (title == "") + cmdstr << " notitle "; + else + cmdstr << " title \"" << title << "\" "; + + if(smooth == "") + cmdstr << "with " << pstyle; + else + cmdstr << "smooth " << smooth; + + // + // Do the actual plot + // + cmd(cmdstr.str()); + + return *this; +} + + +//------------------------------------------------------------------------------ +// +// Plots a 2d graph with errorbars from a list of doubles (x y dy) in a file +// +Gnuplot& Gnuplot::plotfile_xy_err(const std::string &filename, + const unsigned int column_x, + const unsigned int column_y, + const unsigned int column_dy, + const std::string &title) +{ + // + // check if file exists + // + file_available(filename); + + std::ostringstream cmdstr; + // + // command to be sent to gnuplot + // + if (nplots > 0 && two_dim == true) + cmdstr << 
"replot "; + else + cmdstr << "plot "; + + cmdstr << "\"" << filename << "\" using " + << column_x << ":" << column_y << ":" << column_dy + << " with errorbars "; + + if (title == "") + cmdstr << " notitle "; + else + cmdstr << " title \"" << title << "\" "; + + // + // Do the actual plot + // + cmd(cmdstr.str()); + + return *this; +} + + +//------------------------------------------------------------------------------ +// +// Plots a 3d graph from a list of doubles (x y z) saved in a file +// +Gnuplot& Gnuplot::plotfile_xyz(const std::string &filename, + const unsigned int column_x, + const unsigned int column_y, + const unsigned int column_z, + const std::string &title) +{ + // + // check if file exists + // + file_available(filename); + + std::ostringstream cmdstr; + // + // command to be sent to gnuplot + // + if (nplots > 0 && two_dim == false) + cmdstr << "replot "; + else + cmdstr << "splot "; + + cmdstr << "\"" << filename << "\" using " << column_x << ":" << column_y + << ":" << column_z; + + if (title == "") + cmdstr << " notitle with " << pstyle; + else + cmdstr << " title \"" << title << "\" with " << pstyle; + + // + // Do the actual plot + // + cmd(cmdstr.str()); + + return *this; +} + + + +//------------------------------------------------------------------------------ +// +/// * note that this function is not valid for versions of GNUPlot below 4.2 +// +Gnuplot& Gnuplot::plot_image(const unsigned char * ucPicBuf, + const unsigned int iWidth, + const unsigned int iHeight, + const std::string &title) +{ + std::ofstream tmp; + std::string name = create_tmpfile(tmp); + if (name == "") + return *this; + + // + // write the data to file + // + int iIndex = 0; + for(int iRow = 0; iRow < iHeight; iRow++) + { + for(int iColumn = 0; iColumn < iWidth; iColumn++) + { + tmp << iColumn << " " << iRow << " " + << static_cast(ucPicBuf[iIndex++]) << std::endl; + } + } + + tmp.flush(); + tmp.close(); + + + std::ostringstream cmdstr; + // + // command to be sent to 
gnuplot + // + if (nplots > 0 && two_dim == true) + cmdstr << "replot "; + else + cmdstr << "plot "; + + if (title == "") + cmdstr << "\"" << name << "\" with image"; + else + cmdstr << "\"" << name << "\" title \"" << title << "\" with image"; + + // + // Do the actual plot + // + cmd(cmdstr.str()); + + return *this; +} + + + +//------------------------------------------------------------------------------ +// +// Sends a command to an active gnuplot session +// +Gnuplot& Gnuplot::cmd(const std::string &cmdstr) +{ + if( !(valid) ) + { + return *this; + } + + + // int fputs ( const char * str, FILE * stream ); + // writes the string str to the stream. + // The function begins copying from the address specified (str) until it + // reaches the terminating null character ('\0'). This final + // null-character is not copied to the stream. + fputs( (cmdstr+"\n").c_str(), gnucmd ); + + // int fflush ( FILE * stream ); + // If the given stream was open for writing and the last i/o operation was + // an output operation, any unwritten data in the output buffer is written + // to the file. If the argument is a null pointer, all open files are + // flushed. The stream remains open after this call. + fflush(gnucmd); + + + if( cmdstr.find("replot") != std::string::npos ) + { + return *this; + } + else if( cmdstr.find("splot") != std::string::npos ) + { + two_dim = false; + nplots++; + } + else if( cmdstr.find("plot") != std::string::npos ) + { + two_dim = true; + nplots++; + } + + return *this; +} + + + +//------------------------------------------------------------------------------ +// +// Opens up a gnuplot session, ready to receive commands +// +void Gnuplot::init() +{ + // char * getenv ( const char * name ); get value of environment variable + // Retrieves a C string containing the value of the environment variable + // whose name is specified as argument. If the requested variable is not + // part of the environment list, the function returns a NULL pointer. 
+#if ( defined(unix) || defined(__unix) || defined(__unix__) ) && !defined(__APPLE__) + if (getenv("DISPLAY") == NULL) + { + valid = false; + throw GnuplotException("Can't find DISPLAY variable"); + } +#endif + + + // if gnuplot not available + if (!Gnuplot::get_program_path()) + { + valid = false; + throw GnuplotException("Can't find gnuplot"); + } + + + // + // open pipe + // + std::string tmp = Gnuplot::m_sGNUPlotPath + "/" + + Gnuplot::m_sGNUPlotFileName; + + // FILE *popen(const char *command, const char *mode); + // The popen() function shall execute the command specified by the string + // command, create a pipe between the calling program and the executed + // command, and return a pointer to a stream that can be used to either read + // from or write to the pipe. +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__TOS_WIN__) + gnucmd = _popen(tmp.c_str(),"w"); +#elif defined(unix) || defined(__unix) || defined(__unix__) || defined(__APPLE__) + gnucmd = popen(tmp.c_str(),"w"); +#endif + + // popen() shall return a pointer to an open stream that can be used to read + // or write to the pipe. Otherwise, it shall return a null pointer and may + // set errno to indicate the error. 
+ if (!gnucmd) + { + valid = false; + throw GnuplotException("Couldn't open connection to gnuplot"); + } + + nplots = 0; + valid = true; + smooth = ""; + + //set terminal type + showonscreen(); + + return; +} + + +//------------------------------------------------------------------------------ +// +// Find out if a command lives in m_sGNUPlotPath or in PATH +// +bool Gnuplot::get_program_path() +{ + // + // first look in m_sGNUPlotPath for Gnuplot + // + std::string tmp = Gnuplot::m_sGNUPlotPath + "/" + + Gnuplot::m_sGNUPlotFileName; + +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__TOS_WIN__) + if ( Gnuplot::file_exists(tmp,0) ) // check existence +#elif defined(unix) || defined(__unix) || defined(__unix__) || defined(__APPLE__) + if ( Gnuplot::file_exists(tmp,1) ) // check existence and execution permission +#endif + { + return true; + } + + + // + // second look in PATH for Gnuplot + // + char *path; + // Retrieves a C string containing the value of environment variable PATH + path = getenv("PATH"); + + + if (path == NULL) + { + throw GnuplotException("Path is not set"); + return false; + } + else + { + std::list ls; + + //split path (one long string) into list ls of strings +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__TOS_WIN__) + stringtok(ls,path,";"); +#elif defined(unix) || defined(__unix) || defined(__unix__) || defined(__APPLE__) + stringtok(ls,path,":"); +#endif + + // scan list for Gnuplot program files + for (std::list::const_iterator i = ls.begin(); + i != ls.end(); ++i) + { + tmp = (*i) + "/" + Gnuplot::m_sGNUPlotFileName; +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__TOS_WIN__) + if ( Gnuplot::file_exists(tmp,0) ) // check existence +#elif defined(unix) || defined(__unix) || defined(__unix__) || defined(__APPLE__) + if ( Gnuplot::file_exists(tmp,1) ) // check existence and execution permission +#endif + { + Gnuplot::m_sGNUPlotPath = *i; // set m_sGNUPlotPath + return 
true; + } + } + + tmp = "Can't find gnuplot neither in PATH nor in \"" + + Gnuplot::m_sGNUPlotPath + "\""; + throw GnuplotException(tmp); + + Gnuplot::m_sGNUPlotPath = ""; + return false; + } +} + + + +//------------------------------------------------------------------------------ +// +// check if file exists +// +bool Gnuplot::file_exists(const std::string &filename, int mode) +{ + if ( mode < 0 || mode > 7) + { + throw std::runtime_error("In function \"Gnuplot::file_exists\": mode\ + has to be an integer between 0 and 7"); + return false; + } + + // int _access(const char *path, int mode); + // returns 0 if the file has the given mode, + // it returns -1 if the named file does not exist or is not accessible in + // the given mode + // mode = 0 (F_OK) (default): checks file for existence only + // mode = 1 (X_OK): execution permission + // mode = 2 (W_OK): write permission + // mode = 4 (R_OK): read permission + // mode = 6 : read and write permission + // mode = 7 : read, write and execution permission +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__TOS_WIN__) + if (_access(filename.c_str(), mode) == 0) +#elif defined(unix) || defined(__unix) || defined(__unix__) || defined(__APPLE__) + if (access(filename.c_str(), mode) == 0) +#endif + { + return true; + } + else + { + return false; + } + +} + +bool Gnuplot::file_available(const std::string &filename){ + std::ostringstream except; + if( Gnuplot::file_exists(filename,0) ) // check existence + { + if( !(Gnuplot::file_exists(filename,4)) ){// check read permission + except << "No read permission for File \"" << filename << "\""; + throw GnuplotException( except.str() ); + return false; + } + } + else{ + except << "File \"" << filename << "\" does not exist"; + throw GnuplotException( except.str() ); + return false; + } +} + + + +//------------------------------------------------------------------------------ +// +// Opens a temporary file +// +std::string 
Gnuplot::create_tmpfile(std::ofstream &tmp) +{ + +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__TOS_WIN__) + char name[] = "gnuplotiXXXXXX"; //tmp file in working directory +#elif defined(unix) || defined(__unix) || defined(__unix__) || defined(__APPLE__) + char name[] = "/tmp/gnuplotiXXXXXX"; // tmp file in /tmp +#endif + + // + // check if maximum number of temporary files reached + // + if (Gnuplot::tmpfile_num == GP_MAX_TMP_FILES - 1) + { + std::ostringstream except; + except << "Maximum number of temporary files reached (" + << GP_MAX_TMP_FILES << "): cannot open more files" << std::endl; + + throw GnuplotException( except.str() ); + return ""; + } + + // int mkstemp(char *name); + // shall replace the contents of the string pointed to by "name" by a unique + // filename, and return a file descriptor for the file open for reading and + // writing. Otherwise, -1 shall be returned if no suitable file could be + // created. The string in template should look like a filename with six + // trailing 'X' s; mkstemp() replaces each 'X' with a character from the + // portable filename character set. 
The characters are chosen such that the + // resulting name does not duplicate the name of an existing file at the + // time of a call to mkstemp() + + + // + // open temporary files for output + // +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__TOS_WIN__) + if (_mktemp(name) == NULL) +#elif defined(unix) || defined(__unix) || defined(__unix__) || defined(__APPLE__) + if (mkstemp(name) == -1) +#endif + { + std::ostringstream except; + except << "Cannot create temporary file \"" << name << "\""; + throw GnuplotException(except.str()); + return ""; + } + + tmp.open(name); + if (tmp.bad()) + { + std::ostringstream except; + except << "Cannot create temporary file \"" << name << "\""; + throw GnuplotException(except.str()); + return ""; + } + + // + // Save the temporary filename + // + tmpfile_list.push_back(name); + Gnuplot::tmpfile_num++; + + return name; +} + +void Gnuplot::remove_tmpfiles(){ + if ((tmpfile_list).size() > 0) + { + for (unsigned int i = 0; i < tmpfile_list.size(); i++) + remove( tmpfile_list[i].c_str() ); + + Gnuplot::tmpfile_num -= tmpfile_list.size(); + } +} +#endif diff --git a/src/inc/jellyfish/aligned_values_array.hpp b/src/inc/jellyfish/aligned_values_array.hpp new file mode 100644 index 00000000..0ba9df21 --- /dev/null +++ b/src/inc/jellyfish/aligned_values_array.hpp @@ -0,0 +1,141 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . 
+*/ + +#ifndef __JELLYFISH_ALIGNED_VALUE_HPP__ +#define __JELLYFISH_ALIGNED_VALUE_HPP__ + +#include +#include + +namespace jellyfish { + namespace aligned_values { + template + class array : public storage_t { + public: + typedef _key_t key_t; + typedef _val_t val_t; + + private: + typedef typename ::jellyfish::invertible_hash::array key_ary_t; + typedef typename ::jellyfish::direct_indexing::array val_ary_t; + + key_ary_t keys; + val_ary_t vals; + + public: + array(size_t _size, uint_t _key_len, uint_t _reprobe_limit, + size_t *_reprobes) : + keys(_size, _key_len, 0, _reprobe_limit, _reprobes), + vals(keys.get_lsize()) + { } + + array(char *keys_map, char *vals_map, + size_t _size, uint_t _key_len, uint_t _reprobe_limit, + size_t *_reprobes, SquareBinaryMatrix &hash_matrix, + SquareBinaryMatrix &hash_inv_matrix) : + keys(keys_map, _size, _key_len, 0, _reprobe_limit, _reprobes, + hash_matrix, hash_inv_matrix), + vals(vals_map, keys.get_lsize()) + { } + + + void set_matrix(SquareBinaryMatrix &m) { + keys.set_matrix(m); + } + size_t get_size() const { return keys.get_size(); } + uint_t get_key_len() const { return keys.get_key_len(); } + uint_t get_val_len() const { return keys.get_val_len(); } + uint_t get_max_reprobe() const { return keys.get_max_reprobe(); } + size_t get_max_reprobe_offset() const { + return keys.get_max_reprobe_offset(); + } + uint_t get_block_len() const { return keys.get_block_len(); } + uint_t get_block_word_len() const { + return keys.get_block_word_len() + keys.get_block_len() * sizeof(val_t); + } + + size_t floor_block(size_t entries, size_t &blocks) const { + return keys.floor_block(entries, blocks); + } + void zero_keys(const size_t start, const size_t length) { + keys.zero_blocks(start, length); + } + void zero_values(const size_t start, const size_t length) { + vals.zero(start, length); + } + void write_keys_blocks(std::ostream *out, size_t start, size_t length) const { + keys.write_blocks(out, start, length); + } + void 
write_values(std::ostream *out, size_t start, size_t length) const { + vals.write(out, start, length); + } + void write_matrices(std::ostream *out) { + keys.write_ary_header(out); + } + + template + bool add(key_t key, const add_t &val, val_t *oval = 0) { + bool is_new; + size_t id; + + if(!keys.set(key, &is_new, &id)) + return false; + + vals.add(id, val, oval); + return true; + } + + bool get_val(key_t key, val_t &val, bool full = true) const { + key_t v_ignore; + size_t key_id; + + if(!keys.get_val(key, key_id, v_ignore, false)) + return false; + + vals.get_val(key_id, val); + return true; + } + + class iterator { + typename key_ary_t::iterator key_it; + const val_ary_t *const vals; + + public: + iterator(typename key_ary_t::iterator _key_it, const val_ary_t *_vals) : + key_it(_key_it), vals(_vals) {} + + uint64_t get_hash() const { return key_it.get_hash(); } + uint64_t get_pos() const { return key_it.get_pos(); } + uint64_t get_start() const { return key_it.get_start(); } + uint64_t get_end() const { return key_it.get_end(); } + key_t get_key() const { return key_it.get_key(); } + val_t get_val() const { return (*vals)[get_id()]; } + size_t get_id() const { return key_it.get_id(); } + char *get_dna_str() { return key_it.get_dna_str(); } + bool next() { return key_it.next(); } + }; + iterator iterator_all() const { + return iterator(keys.iterator_all(), &vals); + } + iterator iterator_slice(size_t slice_number, size_t number_of_slice) const { + return iterator(keys.iterator_slice(slice_number, number_of_slice), + &vals); + } + }; + } +} + +#endif diff --git a/src/inc/jellyfish/allocators_mmap.hpp b/src/inc/jellyfish/allocators_mmap.hpp new file mode 100644 index 00000000..8d975cda --- /dev/null +++ b/src/inc/jellyfish/allocators_mmap.hpp @@ -0,0 +1,64 @@ +/* This file is part of Jellyfish. 
#ifndef __JELLYFISH_ALLOCATORS_MMAP_HPP__
#define __JELLYFISH_ALLOCATORS_MMAP_HPP__

// NOTE(review): the original five #include targets were lost in extraction;
// the headers below cover every name used in this file.
#include <sys/types.h>
#include <sys/mman.h>
#include <stddef.h>
#include <unistd.h>
#include <pthread.h>

namespace allocators {
  /* Owner of one memory region: the destructor calls munmap(ptr, size)
   * unless ptr is MAP_FAILED, which marks "no region".
   *
   * Because the destructor unmaps, copying an instance would unmap the
   * same region twice; copying is therefore disabled.
   */
  class mmap {
    void   *ptr;
    size_t  size;

    // Not copyable (declared, intentionally not defined).
    mmap(const mmap &);
    mmap &operator=(const mmap &);

  public:
    // Empty allocator: no region, size 0.
    mmap() : ptr(MAP_FAILED), size(0) {}
    // Obtain a region through realloc(_size), then fast_zero() it.
    explicit mmap(size_t _size) : ptr(MAP_FAILED), size(0) {
      realloc(_size);
      fast_zero();
    }
    ~mmap() {
      if(ptr != MAP_FAILED)
        ::munmap(ptr, size);
    }

    // Start of the region, or NULL when there is none.
    void *get_ptr() const { return ptr != MAP_FAILED ? ptr : NULL; }
    size_t get_size() const { return size; }
    // Defined out of line.
    void *realloc(size_t new_size);
    // mlock()/munlock() over the whole region; return their result.
    int lock() { return mlock(ptr, size); }
    int unlock() { return munlock(ptr, size); }

    // Return a number of bytes which is a number of whole pages at
    // least as large as size.
    static size_t round_to_page(size_t _size);

  private:
    static const int nb_threads = 4;
    // Per-thread work description handed to _fast_zero().
    struct tinfo {
      pthread_t  thid;
      char      *start, *end;
      size_t     pgsize;
    };
    void fast_zero();
    static void * _fast_zero(void *_info);
  };
}

#endif
#ifndef __JELLYFISH_ATOMIC_GCC_HPP__
#define __JELLYFISH_ATOMIC_GCC_HPP__

namespace atomic
{
  /* Atomic primitives implemented with the GCC __sync builtins.
   * All functions are templates on the (integral) value type T and take a
   * pointer to volatile T, so they accept both volatile and plain objects.
   */
  class gcc
  {
  public:
    // Compare-and-swap: if *ptr == oval, store nval. Returns the value
    // *ptr held before the operation (== oval on success).
    template<typename T>
    static inline T cas(volatile T *ptr, T oval, T nval) {
      return __sync_val_compare_and_swap(ptr, oval, nval);
    }

    // Atomically store nval in *ptr and return the previous value.
    template<typename T>
    static inline T set(volatile T *ptr, T nval) {
      return __sync_lock_test_and_set(ptr, nval);
    }

    // Atomically add x to *ptr; returns the value AFTER the addition.
    template<typename T>
    static inline T add_fetch(volatile T *ptr, T x) {
      return __sync_add_and_fetch(ptr, x);
    }

    // Atomically add x to *ptr; returns the value BEFORE the addition.
    template<typename T>
    static inline T fetch_add(volatile T *ptr, T x) {
      return __sync_fetch_and_add(ptr, x);
    }

    // Atomically raise *ptr to x when x is larger. Returns x if this call
    // stored it, otherwise the (greater or equal) value found in *ptr.
    template<typename T>
    static inline T set_to_max(volatile T *ptr, T x) {
      T count = *ptr;
      while(x > count) {
        // On failure cas() reports the value another thread stored; retry
        // against it.
        T ncount = cas(ptr, count, x);
        if(ncount == count)
          return x;
        count = ncount;
      }
      return count;
    }
  };
}
#endif
#ifndef __JELLYFISH_CAPPED_INTEGER_HPP__
#define __JELLYFISH_CAPPED_INTEGER_HPP__

#include <iostream>

namespace jellyfish {
  template<typename T> class capped_integer;
  template<typename T>
  std::ostream &operator<<(std::ostream &os, const capped_integer<T> &i);

  /* Integer whose addition saturates at `cap` ((T)-1, i.e. all bits set)
   * instead of wrapping around.
   * NOTE(review): the saturation arithmetic assumes T is an unsigned
   * integer type -- confirm no caller instantiates it with a signed type.
   */
  template<typename T>
  class capped_integer {
    T x;

  public:
    typedef T bits_t;
    static const T cap = (T)-1;

    capped_integer() : x(0) {}
    explicit capped_integer(bits_t _x) : x(_x) {}

    // Declared here; no definition is visible in this header.
    static const capped_integer zero;
    static const capped_integer one;

    // Saturating sum of two capped integers.
    const capped_integer operator+(const capped_integer y) const {
      return *this + y.x;
    }
    // Saturating sum with a raw value.
    const capped_integer operator+(const T& y) const {
      // Largest amount that can be added before reaching cap. Computed in T
      // on purpose: `~x` would be promoted to a negative int for types
      // narrower than int, making every sum saturate.
      const T room = (T)(cap - x);
      if(y > room)
        return capped_integer(cap);
      return capped_integer((T)(y + x));
    }

    bits_t bits() const { return x; }
    float to_float() const { return (float)x; }

    bool operator==(const capped_integer &o) const { return x == o.x; }
    // True when the value is 0.
    bool operator!() const { return x == 0; }

    friend std::ostream &operator<< <> (std::ostream &os,
                                        const capped_integer &i);
  };

  // Print the underlying value.
  template<typename T>
  std::ostream &operator<<(std::ostream &os, const capped_integer<T> &i) {
    return os << i.x;
  }
}

#endif
#ifndef __JELLYFISH_CIRCULAR_BUFFER_HPP__
#define __JELLYFISH_CIRCULAR_BUFFER_HPP__

#include <iostream>

namespace jellyfish {
  /* Fixed-size ring of `size` values of type T. append() overwrites the
   * oldest slot; op() folds a binary operation over all slots, oldest
   * first. Slots start value-initialized (0 for arithmetic types).
   *
   * _size must be >= 1: append() and op() access a slot unconditionally.
   */
  template<typename T>
  class circular_buffer {
    const int  size;
    T         *buffer, *end;   // storage is [buffer, end)
    T         *start;          // next slot to overwrite == oldest value

  public:
    explicit circular_buffer(int _size) : size(_size) {
      buffer = new T[size]();
      end    = buffer + size;
      start  = buffer;
    }

    // Deep copy, so that two buffers never delete[] the same storage.
    // (Assignment is unavailable because `size` is const.)
    circular_buffer(const circular_buffer &rhs) : size(rhs.size) {
      buffer = new T[size];
      for(int i = 0; i < size; ++i)
        buffer[i] = rhs.buffer[i];
      end   = buffer + size;
      start = buffer + (rhs.start - rhs.buffer);
    }

    ~circular_buffer() {
      delete [] buffer;
    }

    // Store v in the oldest slot and advance, wrapping at the end.
    void append(const T v) {
      *start++ = v;

      if(start == end)
        start = buffer;
    }

    // Left fold of o over every slot exactly once, starting from the
    // oldest: o(...o(o(v0, v1), v2)..., vn-1). With a single slot the
    // result is that slot's value (o is not called).
    template<typename U>
    T op(U o) const {
      T *c   = start;
      T  acc = *c++;
      if(c == end)
        c = buffer;

      while(c != start) {
        acc = o(acc, *c++);
        if(c == end)
          c = buffer;
      }
      return acc;
    }

    static T T_times(T &x, T &y) { return x * y; }
    // Product of all the slots.
    T prod() const { return op(T_times); }
  };
}

#endif
+ + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . +*/ + +#include + +namespace jellyfish { + template + class compacted_dumper : public dumper_t { + define_error_class(ErrorWriting); + typedef typename storage_t::iterator iterator; + typedef compacted_hash::writer writer_t; + struct thread_info_t { + pthread_t thread_id; + uint_t id; + locks::pthread::cond cond; + volatile bool token; + writer_t writer; + compacted_dumper *self; + }; + + uint_t threads; + std::string file_prefix; + size_t buffer_size; + uint_t klen, vlen; + uint_t key_len, val_len; + size_t record_len, nb_records, nb_blocks; + storage_t *ary; + uint_t file_index; + struct thread_info_t *thread_info; + uint64_t max_count; + uint64_t volatile unique, distinct, total; + std::ofstream out; + + public: + // klen: key field length in bits in hash (i.e before rounding up to bytes) + // vlen: value field length in bits + compacted_dumper(uint_t _threads, const char *_file_prefix, size_t _buffer_size, + uint_t _vlen, storage_t *_ary) : + threads(_threads), file_prefix(_file_prefix), buffer_size(_buffer_size), + klen(_ary->get_key_len()), vlen(_vlen), ary(_ary), file_index(0) + { + std::cerr << "Compacted dumper init" << std::endl; + key_len = bits_to_bytes(klen); + val_len = bits_to_bytes(vlen); + max_count = (((uint64_t)1) << (8*val_len)) - 1; + record_len = key_len + val_len; + nb_records = ary->floor_block(_buffer_size / record_len, nb_blocks); + + thread_info = new 
struct thread_info_t[threads]; + for(uint_t i = 0; i < threads; i++) { + thread_info[i].token = i == 0; + thread_info[i].writer.initialize(nb_records, ary->get_key_len(), vlen, ary); + thread_info[i].id = i; + thread_info[i].self = this; + } + unique = distinct = total = 0; + } + + ~compacted_dumper() { + if(thread_info) { + delete[] thread_info; + } + } + + static void *dump_to_file_thread(void *arg) { + struct thread_info_t *info = (struct thread_info_t *)arg; + info->self->dump_to_file(info); + return NULL; + } + + void dump_to_file(struct thread_info_t *my_info); + + virtual void dump(); + void update_stats() { + thread_info[0].writer.update_stats_with(&out, unique, distinct, total); + } + }; + + template + void compacted_dumper::dump() { + static const long file_len = pathconf("/", _PC_PATH_MAX); + std::cerr << "dump()" << std::endl; + char file[file_len + 1]; + file[file_len] = '\0'; + int off = snprintf(file, file_len, "%s", file_prefix.c_str()); + if(off < 0) + eraise(ErrorWriting) << "Error creating output path" << err::no; + if(off > 0 && off < file_len) { + int _off = snprintf(file + off, file_len - off, "_%uld", file_index++); + if(_off < 0) + eraise(ErrorWriting) << "Error creating output path" << err::no; + off += _off; + } + if(off >= file_len) + eraise(ErrorWriting) << "File path is too long"; + + + // out.exceptions(std::ios::eofbit|std::ios::failbit|std::ios::badbit); + std::cerr << "Open " << file << std::endl; + out.open(file); + if(!out.good()) + eraise(ErrorWriting) << "'" << file << "': " + << "Can't open file for writing" << err::no; + + + out.write("JFLISTDN", 8); + unique = distinct = total = 0; + for(uint_t i = 0; i < threads; i++) + thread_info[i].token = i == 0; + for(uint_t i = 0; i < threads; i++) { + pthread_create(&thread_info[i].thread_id, NULL, dump_to_file_thread, + &thread_info[i]); + } + + for(uint_t i = 0; i < threads; i++) + pthread_join(thread_info[i].thread_id, NULL); + update_stats(); + out.close(); + } + + template + void 
compacted_dumper::dump_to_file(struct thread_info_t *my_info) { + size_t i; + struct thread_info_t *next_info = &thread_info[(my_info->id + 1) % threads]; + atomic_t atomic; + + if(my_info->token) + my_info->writer.write_header(&out); + + for(i = my_info->id; i * nb_records < ary->get_size(); i += threads) { + // fill up buffer + iterator it(ary, i * nb_records, (i + 1) * nb_records); + + while(it.next()) { + my_info->writer.append(it.key, it.val); + } + + // wait for token & write buffer + my_info->cond.lock(); + while(!my_info->token) { my_info->cond.wait(); } + my_info->cond.unlock(); + my_info->writer.dump(&out); + + // pass on token + my_info->token = false; + next_info->cond.lock(); + next_info->token = true; + next_info->cond.signal(); + next_info->cond.unlock(); + + // zero out memory + ary->zero_blocks(i * nb_blocks, nb_blocks); + } + atomic.add_fetch(&unique, my_info->writer.get_unique()); + atomic.add_fetch(&distinct, my_info->writer.get_distinct()); + atomic.add_fetch(&total, my_info->writer.get_total()); + } +} diff --git a/src/inc/jellyfish/compacted_hash.hpp b/src/inc/jellyfish/compacted_hash.hpp new file mode 100644 index 00000000..75c7bd08 --- /dev/null +++ b/src/inc/jellyfish/compacted_hash.hpp @@ -0,0 +1,530 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . 
+*/ + +#ifndef __JELLYFISH_COMPACTED_HASH__ +#define __JELLYFISH_COMPACTED_HASH__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace jellyfish { + namespace compacted_hash { + define_error_class(ErrorReading); + + static const char *file_type = "JFLISTDN"; + struct header { + char type[8]; // type of file. Expect file_type + uint64_t key_len; + uint64_t val_len; // In bytes + uint64_t size; // In bytes + uint64_t max_reprobe; + uint64_t unique; + uint64_t distinct; + uint64_t total; + uint64_t max_count; + + header() { } + explicit header(char *ptr) { + if(memcmp(ptr, file_type, sizeof(type))) + eraise(ErrorReading) << "Bad file type '" << err::substr(ptr, sizeof(type)) + << "', expected '" << err::substr(file_type, sizeof(type)) << "'"; + memcpy((void *)this, ptr, sizeof(struct header)); + } + }; + + template + class writer { + uint64_t unique, distinct, total, max_count; + size_t nb_records; + uint_t klen, vlen; + uint_t key_len, val_len; + storage_t *ary; + char *buffer, *end, *ptr; + + public: + writer() : unique(0), distinct(0), total(0), max_count(0) + { buffer = ptr = end = NULL; } + + writer(size_t _nb_records, uint_t _klen, uint_t _vlen, storage_t *_ary) + { + initialize(_nb_records, _klen, _vlen, _ary); + } + + void initialize(size_t _nb_records, uint_t _klen, uint_t _vlen, storage_t *_ary) { + unique = distinct = total = max_count = 0; + nb_records = _nb_records; + klen = _klen; + vlen = _vlen; + key_len = bits_to_bytes(klen); + val_len = bits_to_bytes(vlen); + ary = _ary; + buffer = new char[nb_records * (key_len + val_len)]; + end = buffer + (nb_records * (key_len + val_len)); + ptr = buffer; + } + + ~writer() { + if(buffer) + delete buffer; + } + + bool append(uint64_t key, uint64_t val) { + if(ptr >= end) + return false; + memcpy(ptr, &key, key_len); + ptr += key_len; + memcpy(ptr, &val, val_len); + ptr += val_len; + unique += val == 1; + distinct++; + total += val; + if(val > max_count) 
+ max_count = val; + return true; + } + + void dump(std::ostream *out) { + out->write(buffer, ptr - buffer); + ptr = buffer; + } + + void write_header(std::ostream *out) const { + struct header head; + memset(&head, '\0', sizeof(head)); + memcpy(&head.type, file_type, sizeof(head.type)); + head.key_len = klen; + head.val_len = val_len; + head.size = ary->get_size(); + head.max_reprobe = ary->get_max_reprobe_offset(); + out->write((char *)&head, sizeof(head)); + ary->write_ary_header(out); + } + + void update_stats(std::ostream *out) const { + update_stats_with(out, unique, distinct, total, max_count); + } + + void update_stats_with(std::ostream *out, uint64_t _unique, uint64_t _distinct, + uint64_t _total, uint64_t _max_count) const { + if(!out->good()) + return; + out->seekp(0); + if(!out->good()) { + out->clear(); + return; + } + + struct header head; + memcpy(&head.type, file_type, sizeof(head.type)); + head.key_len = klen; + head.val_len = val_len; + head.size = ary->get_size(); + head.max_reprobe = ary->get_max_reprobe_offset(); + head.unique = _unique; + head.distinct = _distinct; + head.total = _total; + head.max_count = _max_count; + out->write((char *)&head, sizeof(head)); + } + + uint64_t get_unique() const { return unique; } + uint64_t get_distinct() const { return distinct; } + uint64_t get_total() const { return total; } + uint64_t get_max_count() const { return max_count; } + uint_t get_key_len_bytes() const { return key_len; } + uint_t get_val_len_bytes() const { return val_len; } + + void reset_counters() { + unique = distinct = total = max_count = 0; + } + }; + + template + class reader { + struct header header; + std::ifstream *io; + uint_t key_len; + SquareBinaryMatrix hash_matrix, hash_inverse_matrix; + size_t record_len, buffer_len; + size_t size_mask; + char *buffer, *end_buffer, *ptr; + char dna_str[33]; + + public: + key_t key; + val_t val; + + reader() { io = 0; buffer = 0; memset(dna_str, '\0', sizeof(dna_str)); } + explicit 
reader(std::string filename, size_t _buff_len = 10000000UL) { + initialize(filename, _buff_len); + } + + void initialize(std::string filename, size_t _buff_len) { + memset(dna_str, '\0', sizeof(dna_str)); + io = new std::ifstream(filename.c_str()); + io->read((char *)&header, sizeof(header)); + if(!io->good()) + eraise(ErrorReading) << "'" << filename << "': " + << "File truncated"; + if(memcmp(header.type, file_type, sizeof(header.type))) + eraise(ErrorReading) << "'" << filename << "': " + << "Bad file type '" + << err::substr(header.type, sizeof(header.type)) << "', expected '" + << err::substr(file_type, sizeof(header.type)) << "'"; + + if(header.key_len > 64 || header.key_len == 0) + eraise(ErrorReading) << "'" << filename << "': " + << "Invalid key length '" + << header.key_len << "'"; + if(header.size != (1UL << floorLog2(header.size))) + eraise(ErrorReading) << "'" << filename << "': " + << "Size '" << header.size + << "' is not a power of 2"; + key_len = (header.key_len / 8) + (header.key_len % 8 != 0); + record_len = key_len + header.val_len; + buffer_len = record_len * (_buff_len / record_len); + buffer = new char[buffer_len]; + ptr = buffer; + end_buffer = NULL; + + hash_matrix.load(io); + hash_inverse_matrix.load(io); + + if(header.distinct != 0) { + std::streamoff list_size = get_file_size(*io); + if(list_size != (std::streamoff)-1 && + list_size - (header.distinct * record_len) != 0) { + eraise(ErrorReading) << "'" << filename << "': " + << "Bad hash size '" << list_size + << "', expected '" + << (header.distinct * record_len) << "' bytes"; + } + } + key = val = 0; + size_mask = header.size - 1; + } + + ~reader() { + if(io) + delete io; + if(buffer) + delete[] buffer; + } + + uint_t get_key_len() const { return header.key_len; } + uint_t get_mer_len() const { return header.key_len / 2; } + uint_t get_val_len() const { return header.val_len; } + size_t get_size() const { return header.size; } + uint64_t get_max_reprobe() const { return 
header.max_reprobe; } + uint64_t get_max_reprobe_offset() const { return header.max_reprobe; } + uint64_t get_unique() const { return header.unique; } + uint64_t get_distinct() const { return header.distinct; } + uint64_t get_total() const { return header.total; } + uint64_t get_max_count() const { return header.max_count; } + SquareBinaryMatrix get_hash_matrix() const { return hash_matrix; } + SquareBinaryMatrix get_hash_inverse_matrix() const { return hash_inverse_matrix; } + void write_ary_header(std::ostream *out) const { + hash_matrix.dump(out); + hash_inverse_matrix.dump(out); + } + + key_t get_key() const { return key; } + val_t get_val() const { return val; } + + + void get_string(char *out) const { + parse_dna::mer_binary_to_string(key, get_mer_len(), out); + } + char* get_dna_str() { + parse_dna::mer_binary_to_string(key, get_mer_len(), dna_str); + return dna_str; + } + uint64_t get_hash() const { return hash_matrix.times(key); } + uint64_t get_pos() const { return hash_matrix.times(key) & size_mask; } + + bool next() { + while(true) { + if(ptr <= end_buffer) { + memcpy(&key, ptr, key_len); + ptr += key_len; + memcpy(&val, ptr, header.val_len); + ptr += header.val_len; + return true; + } + + if(io->fail()) + return false; + io->read(buffer, buffer_len); + // if(record_len * (io->gcount() / record_len) != io->gcount()) + // return false; + ptr = buffer; + end_buffer = NULL; + if((size_t)io->gcount() >= record_len) + end_buffer = ptr + (io->gcount() - record_len); + } + } + }; + + template + class query { + mapped_file file; + struct header header; + uint_t key_len; + uint_t val_len; + uint_t record_len; + SquareBinaryMatrix hash_matrix; + SquareBinaryMatrix hash_inverse_matrix; + char *base; + uint64_t size; + uint64_t size_mask; + uint64_t last_id; + key_t first_key, last_key; + uint64_t first_pos, last_pos; + bool canonical; + + public: + /* Can't wait for C++0x to be finalized and call constructor + from constructor! 
+ */ + explicit query(mapped_file &map) : + file(map), + header(file.base()), + key_len((header.key_len / 8) + (header.key_len % 8 != 0)), + val_len(header.val_len), + record_len(key_len + header.val_len), + hash_matrix(file.base() + sizeof(header)), + hash_inverse_matrix(file.base() + sizeof(header) + hash_matrix.dump_size()), + base(file.base() + sizeof(header) + hash_matrix.dump_size() + hash_inverse_matrix.dump_size()), + size(header.size), + size_mask(header.size - 1), + last_id((file.end() - base) / record_len), + canonical(false) + { + if(header.distinct != 0 && file.end() - base - header.distinct * record_len != 0) + eraise(ErrorReading) << "'" << file.path() << "': " + << "Bad hash size '" << (file.end() - base) + << "', expected '" << header.distinct * record_len << "' bytes"; + + get_key(0, &first_key); + first_pos = get_pos(first_key); + get_key(last_id - 1, &last_key); + last_pos = get_pos(last_key); + } + explicit query(std::string filename) : + file(filename.c_str()), + header(file.base()), + key_len((header.key_len / 8) + (header.key_len % 8 != 0)), + val_len(header.val_len), + record_len(key_len + header.val_len), + hash_matrix(file.base() + sizeof(header)), + hash_inverse_matrix(file.base() + sizeof(header) + hash_matrix.dump_size()), + base(file.base() + sizeof(header) + hash_matrix.dump_size() + hash_inverse_matrix.dump_size()), + size(header.size), + size_mask(header.size - 1), + last_id((file.end() - base) / record_len), + canonical(false) + { + if(header.distinct != 0 && file.end() - base - header.distinct * record_len != 0) + eraise(ErrorReading) << "'" << file.path() << "': " + << "Bad hash size '" << (file.end() - base) + << "', expected '" << header.distinct * record_len << "' bytes"; + + get_key(0, &first_key); + first_pos = get_pos(first_key); + get_key(last_id - 1, &last_key); + last_pos = get_pos(last_key); + } + + uint_t get_key_len() const { return header.key_len; } + uint_t get_mer_len() const { return header.key_len / 2; } + 
uint_t get_val_len() const { return header.val_len; } + size_t get_size() const { return header.size; } + size_t get_nb_mers() const { return last_id; } + uint64_t get_max_reprobe() const { return header.max_reprobe; } + uint64_t get_max_reprobe_offset() const { return header.max_reprobe; } + uint64_t get_unique() const { return header.unique; } + uint64_t get_distinct() const { return header.distinct; } + uint64_t get_total() const { return header.total; } + uint64_t get_max_count() const { return header.max_count; } + SquareBinaryMatrix get_hash_matrix() const { return hash_matrix; } + SquareBinaryMatrix get_hash_inverse_matrix() const { return hash_inverse_matrix; } + bool get_canonical() const { return canonical; } + void set_canonical(bool v) { canonical = v; } + + /* No check is made on the validity of the string passed. Should only contained [acgtACGT] to get a valid answer. + */ + val_t operator[] (const char *key_s) const { + return get_key_val(parse_dna::mer_string_to_binary(key_s, get_mer_len())); + } + val_t operator[] (const key_t key) const { return get_key_val(key); } + + void get_key(size_t id, key_t *k) const { + *k = 0; + memcpy(k, base + id * record_len, key_len); + } + void get_val(size_t id, val_t *v) const { + *v = 0; + memcpy(v, base + id * record_len + key_len, val_len); + } + uint64_t get_pos(key_t k) const { + return hash_matrix.times(k) & size_mask; + } + + val_t get_key_val(const key_t key) const { + uint64_t id; + val_t res; + if(get_key_val_id(key, &res, &id)) + return res; + else + return 0; + } + + bool get_key_val_id(const key_t _key, val_t *res, + uint64_t *id) const { + key_t key; + if(canonical) { + key = parse_dna::reverse_complement(_key, get_mer_len()); + if(key > _key) + key = _key; + } else { + key = _key; + } + if(key == first_key) { + get_val(0, res); + *id = 0; + return true; + } + if(key == last_key) { + get_val(last_id - 1, res); + *id = last_id; + return true; + } + uint64_t pos = get_pos(key); + if(pos < first_pos || 
pos > last_pos) + return false; + uint64_t first = 0, last = last_id; + while(first < last - 1) { + uint64_t middle = (first + last) / 2; + key_t mid_key; + get_key(middle, &mid_key); + // printf("%ld %ld %ld %ld %ld %ld %ld\n", key, pos, first, middle, last, mid_key, get_pos(mid_key)); + if(key == mid_key) { + get_val(middle, res); + *id = middle; + return true; + } + uint64_t mid_pos = get_pos(mid_key); + if(mid_pos > pos || (mid_pos == pos && mid_key > key)) + last = middle; + else + first = middle; + } + return false; + } + + class iterator { + char *base, *ptr; + uint64_t last_id; + uint_t key_len; + uint_t val_len; + uint_t record_len; + uint_t mer_len; + uint64_t id; + key_t key; + val_t val; + char dna_str[33]; + + public: + iterator(char *_base, uint64_t _last_id, uint_t _key_len, uint_t _val_len, uint_t _mer_len) : + base(_base), ptr(_base), last_id(_last_id), key_len(_key_len), val_len(_val_len), + record_len(key_len + val_len), mer_len(_mer_len), id(0), key(0), val(0) + { + memset(dna_str, '\0', sizeof(dna_str)); + } + + key_t get_key() const { return key; } + val_t get_val() const { return val; } + uint64_t get_id() const { return id; } + + bool next() { + if(id >= last_id) + return false; + ++id; + memcpy(&key, ptr, key_len); + ptr += key_len; + memcpy(&val, ptr, val_len); + ptr += val_len; + return true; + } + + bool next(uint64_t *_id, key_t *_key, val_t *_val) { + if(id >= last_id) + return false; + *_id = atomic::gcc::add_fetch(&id, (uint64_t)1) - 1; + if(*_id >= last_id) + return false; + char *ptr = base + (*_id) * record_len; + *_key = 0; + memcpy(_key, ptr, key_len); + ptr += key_len; + *_val = 0; + memcpy(_val, ptr, val_len); + return true; + } + + inline bool next(key_t *_key, val_t *_val) { + uint64_t _id; + return next(&_id, _key, _val); + } + + char *get_dna_str() { + parse_dna::mer_binary_to_string(key, mer_len, dna_str); + return dna_str; + } + + void get_dna_str(char *out) { + parse_dna::mer_binary_to_string(key, mer_len, out); + } + 
}; + + iterator get_iterator() const { return iterator_all(); } + iterator iterator_all() const { return iterator(base, last_id, key_len, val_len, get_mer_len()); } + iterator iterator_slice(uint64_t slice_number, uint64_t number_of_slice) const { + std::pair res = + slice(slice_number, number_of_slice, last_id); + char *it_base = base + res.first * record_len; + uint64_t it_last_id = res.second - res.first; + + if(it_base >= file.end()) { + it_base = base; + it_last_id = 0; + } else if(it_base + it_last_id * record_len > file.end()) + it_last_id = (file.end() - it_base) / record_len; + + return iterator(it_base, it_last_id, key_len, val_len, get_mer_len()); + } + }; + } +} +#endif /* __COMPACTED_HASH__ */ diff --git a/src/inc/jellyfish/concurrent_queues.hpp b/src/inc/jellyfish/concurrent_queues.hpp new file mode 100644 index 00000000..1bac387f --- /dev/null +++ b/src/inc/jellyfish/concurrent_queues.hpp @@ -0,0 +1,161 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . +*/ + +#ifndef __JELLYFISH_CONCURRENT_QUEUES_HPP__ +#define __JELLYFISH_CONCURRENT_QUEUES_HPP__ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/*** + * Circular buffer of fixed size with thread safe enqueue and dequeue + * operation to make it behave like a FIFO. Elements are enqueued at + * the head and dequeued at the tail. 
Never more than n elements + * should be enqueued if the size is n+1. There is no check for this. + * + * It is possible for the tail pointer to go past an element (i.e. it + * has been "dequeued"), but the thread is slow to zero the pointer + * (i.e. to claim the element). It is then possible for the head + * pointer to point to this not yet claimed element. The enqueue() + * method blindly skip over such an element. Hence, it is possible + * that the same element will be dequeued again before it is + * claimed. Or, it will be claimed after being skipped and another + * thread will dequeue what looks like an empty element. The outer + * loop of dequeue() handles this situation. + */ + +namespace jellyfish { + template + class concurrent_queue { + Val **queue; + const uint64_t size; + uint64_t volatile head; + uint64_t volatile tail; + bool volatile closed; + divisor64 size_div; + + public: + explicit concurrent_queue(uint64_t _size) : + size(20 *_size), head(0), tail(0), closed(false), size_div(size) + { + queue = new Val *[size]; + memset(queue, 0, sizeof(Val *) * size); + } + ~concurrent_queue() { delete [] queue; } + + void enqueue(Val *v); + Val *dequeue(); + bool is_closed() { return closed; } + void close() { closed = true; __sync_synchronize(); } + bool has_space() { return head != tail; } + bool is_low() { + uint64_t ctail = tail; + __sync_synchronize(); + uint64_t chead = head; + int64_t len = chead - ctail; + if(len < 0) + len += size; + return (uint64_t)(4*len) <= size; + } + uint64_t usage() { + uint64_t ctail = tail; + __sync_synchronize(); + uint64_t chead = head; + int64_t len = chead - ctail; + if(len < 0) + len += size; + return len; + } + }; + + template + void concurrent_queue::enqueue(Val *v) { + int done = 0; + uint64_t chead; + + chead = head; + do { + // uint64_t q, nhead; + uint64_t nhead = (chead + 1) % size_div; + // size_div.division(chead + 1, q, nhead); + // uint64_t nhead = (chead + 1) % size; + + done = 
(atomic::gcc::cas(&queue[chead], (Val*)0, v) == (Val*)0); + chead = atomic::gcc::cas(&head, chead, nhead); + } while(!done); + + assert(head < size); + assert(tail < size); + } + + template + Val *concurrent_queue::dequeue() { + bool done = false; + Val *res; + uint64_t ctail, ntail; + + ctail = tail; + // __sync_synchronize(); + do { + bool dequeued = false; + do { + // if(ctail == head) + // return NULL; + + // Complicated way to do ctail == head. Is it necessary? Or is + // the memory barrier above sufficient? Or even necessary? + if(atomic::gcc::cas(&head, ctail, ctail) == ctail) { + assert(head < size); + assert(tail < size); + return NULL; + } + // ntail = (ctail + 1) % size; + // uint64_t q; + // size_div.division(ctail + 1, q, ntail); + ntail = (ctail + 1) % size_div; + ntail = atomic::gcc::cas(&tail, ctail, ntail); + dequeued = ntail == ctail; + ctail = ntail; + } while(!dequeued); + + // Claim dequeued slot. We may have dequeued an element which is + // empty or that another thread also has dequeued but not yet + // claimed. This can happen if a thread is slow to claim (set + // pointer to 0) and the enqueue method has queued elements past + // this one. + res = queue[ctail]; + if(res) + done = atomic::gcc::cas(&queue[ctail], res, (Val*)0) == res; + } while(!done); + + assert(head < size); + assert(tail < size); + + return res; + } +} + +#endif diff --git a/src/inc/jellyfish/counter.hpp b/src/inc/jellyfish/counter.hpp new file mode 100644 index 00000000..83f0c56f --- /dev/null +++ b/src/inc/jellyfish/counter.hpp @@ -0,0 +1,52 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . +*/ + +#include + +/* 64-bit counter shared between threads; every modification goes through atomic::gcc::fetch_add. */ +class counter_t { + volatile uint64_t count; + +public: + counter_t() : count(0) {} + + /* Adds 1 and returns the result of fetch_add (presumably the value before the addition -- confirm against atomic::gcc). */ + inline uint64_t operator++(int) { + return atomic::gcc::fetch_add(&count, (uint64_t)1); + } + /* Adds x; same return convention as operator++. */ + inline uint64_t inc(uint64_t x) { + return atomic::gcc::fetch_add(&count, x); + } + inline uint64_t get() const { return count; } + + /* Per-user helper that reserves bs values at a time from the parent counter, so the shared counter is only touched once every bs calls. i starts at bs, which forces a reservation on the first use. */ + class block { + counter_t *c; + uint64_t bs; + uint64_t base, i; + + // friend counter_t; + public: + block(counter_t *_c, uint64_t _bs) : c(_c), bs(_bs), base(0), i(bs) {} + + public: + inline uint64_t operator++(int) { + if(i >= bs) { + i = 0; + base = c->inc(bs); + } + return base + i++; + } + }; + block get_block(uint64_t bs = 100) { return block(this, bs); } +}; diff --git a/src/inc/jellyfish/dbg.hpp b/src/inc/jellyfish/dbg.hpp new file mode 100644 index 00000000..d894696b --- /dev/null +++ b/src/inc/jellyfish/dbg.hpp @@ -0,0 +1,153 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see .
+*/ + +#ifndef __DBG_HPP__ +#define __DBG_HPP__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace dbg { + pid_t gettid(); + + /* Output-only string buffer that can report whether the last character written is whitespace. */ + class stringbuf : public std::stringbuf { + public: + stringbuf() : std::stringbuf(std::ios_base::out) { } + explicit stringbuf(const std::string &str) : + std::stringbuf(str, std::ios_base::out) { } + + /* True when nothing has been written yet or the last written character is whitespace. */ + bool end_is_space() { + if(pptr() == pbase()) + return true; + return isspace(*(pptr() - 1)); + } + friend class print_t; + }; + + /* Character range (pointer + length) printed verbatim by print_t, without requiring a terminating NUL. */ + class str { + const char *_s; + const size_t _l; + public: + str(const char *s, size_t len) : _s(s), _l(len) {} + friend class print_t; + }; + + /* Stream manipulator tags: xspace inserts a space unless the message already ends with whitespace; no_flush makes the destructor end the line with "\n" instead of std::endl. */ + class xspace { }; + class no_flush { }; + + /* Builds one debug message in a private buffer and writes it to std::cerr from the destructor, under _lock. */ + class print_t { + static pthread_mutex_t _lock; + static volatile pid_t _print_tid; + + stringbuf _strbuf; + std::ostream _buf; + bool _flush; + public: + /* Starts the message with "<pthread id>/<tid>:<file>:<function>:<line>: ". When file contains a '/', the text printed starts at that last '/' (strrchr result is used as is). */ + print_t(const char *file, const char *function, int line) : + _buf(&_strbuf), _flush(true) + { + const char *file_basename = strrchr(file, '/'); + if(!file_basename) + file_basename = file; + _buf << pthread_self() << "/" << gettid() << ":" + << file_basename << ":" << function << ":" << line << ": "; + } + + /* Emits the message only when _print_tid is 0 or equals the calling thread's tid. */ + ~print_t() { + if(_print_tid == 0 || gettid() == _print_tid) { + pthread_mutex_lock(&_lock); + std::cerr.write(_strbuf.pbase(), _strbuf.pptr() - _strbuf.pbase()); + if(_flush) + std::cerr << std::endl; + else + std::cerr << "\n"; + pthread_mutex_unlock(&_lock); + } + } + + static int set_signal(int signum = SIGUSR1); + static void signal_handler(int signum, siginfo_t *info, void *context); + static pid_t print_tid() { return _print_tid; } + static void print_tid(pid_t new_tid) { _print_tid = new_tid; } + + /* Prints a NULL-terminated array of C strings, one per line. */ + print_t & operator<<(const char *a[]) { + for(int i = 0; a[i]; i++) + _buf << (i ?
"\n" : "") << a[i]; + return *this; + } + print_t & operator<<(const std::exception &e) { + _buf << e.what(); + return *this; + } + print_t & operator<<(const str &ss) { + _buf.write(ss._s, ss._l); + return *this; + } + print_t & operator<<(const xspace &xs) { + if(!_strbuf.end_is_space()) + _buf << " "; + return *this; + } + print_t &operator<<(const no_flush &nf) { + _flush = false; + return *this; + } + print_t & operator<<(const Time &t) { + _buf << t.str(); + return *this; + } + template + print_t & operator<<(const T &x) { + _buf << x; + return *this; + } + }; + + /* Drop-in replacement for print_t that discards everything streamed into it. */ + class no_print_t { + public: + no_print_t() {} + + template + no_print_t & operator<<(const T &x) { return *this; } + }; + + void tic(); + Time toc(); +} + +/* With DEBUG defined, DBG/NFDBG create a print_t temporary and V(v) prints "v:<value>"; otherwise they stream into no_print_t and V(v) is just v. */ +#ifdef DEBUG +#define DBG if(1) dbg::print_t(__FILE__, __FUNCTION__, __LINE__) +#define NFDBG if(1) dbg::print_t(__FILE__, __FUNCTION__, __LINE__) << dbg::no_flush() +#define V(v) dbg::xspace() << #v ":" << v +#else +#define DBG if(1) dbg::no_print_t() +#define NFDBG if(1) dbg::no_print_t() +#define V(v) v +#endif + +#endif /* __DBG_HPP__ */ diff --git a/src/inc/jellyfish/direct_indexing_array.hpp b/src/inc/jellyfish/direct_indexing_array.hpp new file mode 100644 index 00000000..d6b4e9c7 --- /dev/null +++ b/src/inc/jellyfish/direct_indexing_array.hpp @@ -0,0 +1,151 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see .
+*/ + +#ifndef __JELLYFISH_DIRECT_INDEXING_ARRAY_HPP__ +#define __JELLYFISH_DIRECT_INDEXING_ARRAY_HPP__ + +namespace jellyfish { + namespace direct_indexing { + /* Table of 2^key_len values addressed directly by key: data[key] holds the raw bits of the value stored for key. */ + template + class array { + public: + typedef typename val_t::bits_t bits_t; + + uint_t key_len; + size_t size; + mem_block_t mem_block; + bits_t *data; + atomic_t atomic; + + + public: + /* Allocates size * sizeof(bits_t) bytes through mem_block and uses them as the table. */ + explicit array(uint_t _key_len) : + key_len(_key_len), size(((size_t)1) << key_len), + mem_block(size * sizeof(bits_t)), + data((bits_t *)mem_block.get_ptr()) + { } + + /* Uses caller-provided memory at map as the table; mem_block is left default-constructed. */ + array(char *map, uint_t _key_len) : + key_len(_key_len), size(((size_t)1) << key_len), + data((bits_t *)map) + { } + + size_t get_size() const { return size; } + uint_t get_key_len() const { return key_len; } + uint_t get_val_len() const { return sizeof(bits_t); } + size_t get_max_reprobe_offset() const { return 1; } + + /* Writes an identity matrix of dimension key_len twice. */ + void write_ary_header(std::ostream *out) const { + SquareBinaryMatrix id(key_len); + id.init_identity(); + id.dump(out); + id.dump(out); + } + void write_raw(std::ostream *out) const {} + + /* Replaces data[key] by val_t(data[key]) + val using a CAS retry loop. On success the previous value is stored in *_oval when _oval is non-null. Always returns true; the return after the loop is unreachable. */ + template + bool add(key_t key, const add_t &val, val_t *_oval = 0) { + bits_t oval = data[key]; + val_t nval = val_t(oval) + val; + + while(true) { + bits_t noval = atomic.cas(&data[key], oval, nval.bits()); + if(noval == oval) { + if(_oval) + *_oval = val_t(oval); + return true; + } + oval = noval; + nval = val_t(oval) + val; + } + return true; + } + + /* Reads data[key] into val; the full argument is unused and the result is always true. */ + bool get_val(key_t key, val_t &val, bool full = true) const { + val = data[key]; + return true; + } + + /* Walks the non-zero entries of [start, end), with end clamped to the table size. key, val and id are only meaningful after next() has returned true. */ + class iterator { + const array *ary; + size_t start_id; + size_t nid; + size_t end_id; + key_t key; + bits_t val; + size_t id; + + public: + iterator(const array *_ary, size_t start, size_t end) : + ary(_ary), start_id(start), nid(start), + end_id(end > ary->get_size() ?
ary->get_size() : end) + {} + + void get_string(char *out) const { + parse_dna::mer_binary_to_string(key, ary->get_key_len() / 2, out); + } + uint64_t get_hash() const { return key; } + uint64_t get_pos() const { return key; } + uint64_t get_start() const { return start_id; } + uint64_t get_end() const { return end_id; } + key_t get_key() const { return key; } + val_t get_val() const { return val_t(val); } + size_t get_id() const { return id; } + + /* Moves to the next entry whose stored bits are non-zero; returns false once end_id is reached. */ + bool next() { + while((id = nid) < end_id) { + nid++; + val = ary->data[id]; + if(val) { + key = id; + return true; + } + } + return false; + } + }; + friend class iterator; + iterator iterator_all() const { return iterator(this, 0, get_size()); } + iterator iterator_slice(size_t slice_number, size_t number_of_slice) const { + std::pair res = slice(slice_number, number_of_slice, get_size()); + return iterator(this, res.first, res.second); + } + + /** + * Zero out entries in [start, start+length). + */ + void zero(size_t start, size_t length) { + if(start >= size) + return; + if(start + length > size) + length = size - start; + memset(data + start, '\0', length * sizeof(*data)); + } + + /* Writes the raw bytes of entries [start, start+length), clipped to the table, to out. */ + void write(std::ostream *out, const size_t start, size_t length) const { + if(start >= size) + return; + if(start + length > size) + length = size - start; + out->write((char *)(data + start), length * sizeof(*data)); + } + + val_t operator[](key_t key) const { return val_t(data[key]); } + }; + } +} + +#endif diff --git a/src/inc/jellyfish/direct_sorted_dumper.hpp b/src/inc/jellyfish/direct_sorted_dumper.hpp new file mode 100644 index 00000000..163bd7f5 --- /dev/null +++ b/src/inc/jellyfish/direct_sorted_dumper.hpp @@ -0,0 +1,131 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version.
+ + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . +*/ + +#include +#include +#include +#include + +namespace jellyfish { + /* Multi-threaded dumper for a directly indexed array. Each thread filters slices of nb_records entries into its own writer; a token ring serialises the writes so slices reach the output file in order. Each slice is zeroed once written. */ + template + class direct_sorted_dumper : public dumper_t, public thread_exec { + typedef typename storage_t::iterator iterator; + typedef typename compacted_hash::writer writer_t; + typedef token_ring token_ring_t; + + struct thread_info_t { + writer_t writer; + token_ring_t::token *token; + }; + + uint_t threads; + const char *file_prefix; + size_t buffer_size; + uint_t klen, vlen; + uint_t key_len, val_len; + size_t record_len, nb_records; + storage_t *ary; + int file_index; + token_ring_t tr; + uint64_t lower_count, upper_count; + struct thread_info_t *thread_info; + uint64_t volatile unique, distinct, total, max_count; + std::ofstream *out; + bool one_file; + + public: + /* _buffer_size is the per-thread buffer in bytes; it is converted to a number of records of key_len + val_len bytes. file_index starts at 0 so the first numbered output file gets a defined suffix, and out stays null until _dump() opens a file. */ + direct_sorted_dumper(uint_t _threads, const char *_file_prefix, + size_t _buffer_size, uint_t _vlen, storage_t *_ary) : + threads(_threads), file_prefix(_file_prefix), buffer_size(_buffer_size), + klen(_ary->get_key_len()), vlen(_vlen), ary(_ary), file_index(0), + tr() , lower_count(0), upper_count(std::numeric_limits::max()), + out(0), one_file(false) + { + key_len = bits_to_bytes(klen); + val_len = bits_to_bytes(vlen); + record_len = key_len + val_len; + nb_records = _buffer_size / record_len; + thread_info = new struct thread_info_t[threads]; + for(uint_t i = 0; i < threads; i++) { + thread_info[i].writer.initialize(nb_records, klen, vlen, ary); + thread_info[i].token = tr.new_token(); + } + unique = distinct = total = max_count = 0; + } + + ~direct_sorted_dumper() { + if(thread_info) + delete[] thread_info; + } + + /* When one_file is set, _dump() writes to file_prefix itself instead of a numbered file. */ + bool get_one_file() const { return one_file; }
+ void set_one_file(bool nv) { one_file = nv; } + + /* Only entries whose value bits lie in [lower_count, upper_count] are written. */ + void set_lower_count(uint64_t l) { lower_count = l; } + void set_upper_count(uint64_t u) { upper_count = u; } + + virtual void start(int i) { dump_to_file(i); } + void dump_to_file(int i); + + virtual void _dump(); + void update_stats() { + thread_info[0].writer.update_stats_with(out, unique, distinct, total, + max_count); + } + }; + + /* Opens the output file, writes the header, runs dump_to_file() on all threads, then writes the accumulated statistics and closes the file. out points at a local stream and is only valid during this call. */ + template + void direct_sorted_dumper::_dump() { + std::ofstream _out; + if(one_file) { + _out.open(file_prefix); + } else { + open_next_file(file_prefix, &file_index, _out); + } + out = &_out; + unique = distinct = total = max_count = 0; + tr.reset(); + thread_info[0].writer.write_header(out); + exec_join(threads); + update_stats(); + _out.close(); + } + + /* Worker body for thread id: handles slices id, id + threads, id + 2 * threads, ... and finally folds this writer's counters into the shared totals. */ + template + void direct_sorted_dumper::dump_to_file(int id) { + size_t i; + struct thread_info_t *my_info = &thread_info[id]; + atomic_t atomic; + + my_info->writer.reset_counters(); + + for(i = id; i * nb_records < ary->get_size(); i += threads) { + iterator it(ary, i * nb_records, (i + 1) * nb_records); + while(it.next()) + if(it.get_val().bits() >= lower_count && it.get_val().bits() <= upper_count) + my_info->writer.append(it.get_key(), it.get_val().bits()); + + my_info->token->wait(); + my_info->writer.dump(out); + my_info->token->pass(); + ary->zero(i * nb_records, nb_records); + } + atomic.add_fetch(&unique, my_info->writer.get_unique()); + atomic.add_fetch(&distinct, my_info->writer.get_distinct()); + atomic.add_fetch(&total, my_info->writer.get_total()); + atomic.set_to_max(&max_count, my_info->writer.get_max_count()); + } +} diff --git a/src/inc/jellyfish/divisor.hpp b/src/inc/jellyfish/divisor.hpp new file mode 100644 index 00000000..9ef45cbe --- /dev/null +++ b/src/inc/jellyfish/divisor.hpp @@ -0,0 +1,135 @@ +/* This file is part of Jellyfish.
+ + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . +*/ + +#ifndef __JELLYFISH_DIVISOR_HPP__ +#define __JELLYFISH_DIVISOR_HPP__ + +#include +#include +#ifdef HAVE_CONFIG_H +#include +#endif + +namespace jellyfish { + /* Division and remainder by a fixed 64-bit divisor d_. With HAVE_INT128 the constructor precomputes a shift p_ and a multiplier m_ so that divide() needs a multiplication and shifts only; m_ == 0 marks the case handled by a plain shift/mask. Without HAVE_INT128 the built-in / and % are used. */ + class divisor64 { + const uint64_t d_; +#ifdef HAVE_INT128 + const uint16_t p_; + const unsigned __int128 m_; +#endif + + public: + explicit divisor64(uint64_t d) : + d_(d) +#ifdef HAVE_INT128 + , p_(ceilLog2(d_)), + m_((div_ceil((unsigned __int128)1 << (64 + p_), (unsigned __int128)d_)) & (uint64_t)-1) +#endif + { } + + /* Default divisor is 0: dividing by such an object is not meaningful (division by zero in the non-HAVE_INT128 path). */ + divisor64() : + d_(0) +#ifdef HAVE_INT128 + , p_(0), m_(0) +#endif + { } + + explicit divisor64(const divisor64& rhs) : + d_(rhs.d_) +#ifdef HAVE_INT128 + , p_(rhs.p_), + m_(rhs.m_) +#endif + { } + + /* Quotient n / d_. */ + inline uint64_t divide(const uint64_t n) const { +#ifdef HAVE_INT128 + switch(m_) { + case 0: + return n >> p_; + default: + const unsigned __int128 n_ = (unsigned __int128)n; + return (n_ + ((n_ * m_) >> 64)) >> p_; + } +#else + return n / d_; +#endif + } + + /* Remainder n % d_. */ + inline uint64_t remainder(uint64_t n) const { +#ifdef HAVE_INT128 + switch(m_) { + case 0: + return n & (((uint64_t)1 << p_) - 1); + default: + return n - divide(n) * d_; + } +#else + return n % d_; +#endif + } + + // Euclidian division: d.division(n, q, r) sets q <- n / d and r + // <- n % d. This is faster than doing each independently.
+ inline void division(uint64_t n, uint64_t &q, uint64_t &r) const { +#ifdef HAVE_INT128 + switch(m_) { + case 0: + q = n >> p_; + r = n & (((uint64_t)1 << p_) - 1); + break; + default: + q = divide(n); + r = n - q * d_; + break; + } +#else + q = n / d_; + r = n % d_; +#endif + } + + /* Accessors for the divisor and the precomputed constants; p() and m() report 0 when HAVE_INT128 is not defined. */ + uint64_t d() const { return d_; } + uint64_t p() const { +#ifdef HAVE_INT128 + return p_; +#else + return 0; +#endif + } + uint64_t m() const { +#ifdef HAVE_INT128 + return m_; +#else + return 0; +#endif + } + }; + + /* n / d and n % d for a precomputed divisor. */ + inline uint64_t operator/(uint64_t n, const divisor64& d) { + return d.divide(n); + } + inline uint64_t operator%(uint64_t n, const divisor64& d) { + return d.remainder(n); + } +} + +inline std::ostream& operator<<(std::ostream& os, const jellyfish::divisor64& d) { + return os << "d:" << d.d() << ",p:" << d.p() << ",m:" << d.m(); +} + +#endif /* __JELLYFISH_DIVISOR_HPP__ */ + diff --git a/src/inc/jellyfish/dna_codes.hpp b/src/inc/jellyfish/dna_codes.hpp new file mode 100644 index 00000000..80e92cdd --- /dev/null +++ b/src/inc/jellyfish/dna_codes.hpp @@ -0,0 +1,20 @@ +#ifndef __DNA_CODE_HPP__ +#define __DNA_CODE_HPP__ + +#include +#include + +namespace jellyfish { +// The four bases need four distinct codes (2-bit encoding), otherwise +// every base would be indistinguishable from A. +static const uint_t CODE_A = 0; +static const uint_t CODE_C = 1; +static const uint_t CODE_G = 2; +static const uint_t CODE_T = 3; +// Non DNA codes have the MSB on +static const uint_t CODE_RESET = (uint_t)-1; +static const uint_t CODE_IGNORE = (uint_t)-2; +static const uint_t CODE_COMMENT = (uint_t)-3; +static const uint_t CODE_NOT_DNA = ((uint_t)1) << (bsizeof(uint_t) - 1); +extern const char dna_codes[256]; +}; + +#endif /* __DNA_CODE_HPP__ */ diff --git a/src/inc/jellyfish/double_fifo_input.hpp b/src/inc/jellyfish/double_fifo_input.hpp new file mode 100644 index 00000000..19fa2b87 --- /dev/null +++ b/src/inc/jellyfish/double_fifo_input.hpp @@ -0,0 +1,192 @@ +/* This file is part of Jellyfish.
+ + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . +*/ + +#ifndef __JELLYFISH_DOUBLE_FIFO_INPUT__ +#define __JELLYFISH_DOUBLE_FIFO_INPUT__ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace jellyfish { + /* Double lock free fifo containing elements of type T. + * + * The input thread, created by this class, runs the virtual + * function 'fill' to fill up tockens from the wq (to be Written + * tocken Queue) and append then into the rq (to be Read tocken + * Queue). + * + * next() returns a pointer to a filled tocken. If the queue is + * empty, it will sleep some (start with 1/100th +/- 50%, and + * exponential back off) to give time for the queue to fill up.
+ */ + template + class double_fifo_input { + define_error_class(Error); + typedef concurrent_queue queue; + + // The only transitions are: + // WORKING -> SLEEPING -> WAKENING -> WORKING + enum state_t { WORKING, SLEEPING, WAKENING }; + + queue rq, wq; + T * buckets; + const unsigned long nb_buckets; + state_t volatile state; + pthread_t input_id; + locks::pthread::cond full_queue; + + static void *static_input_routine(void *arg); + void input_routine(); + + public: + typedef T bucket_t; + explicit double_fifo_input(unsigned long _nb_buckets); + virtual ~double_fifo_input(); + + virtual void fill() = 0; + T *next(); + void release(T *bucket); + bool is_closed() { return rq.is_closed(); } + + typedef T *bucket_iterator; + bucket_iterator bucket_begin() const { return buckets; } + bucket_iterator bucket_end() const { return buckets + nb_buckets; } + + protected: + // Get bucket to fill and release. + T *write_next(); + void write_release(T *bucket); + void close() { rq.close(); } + + private: + // Wake up input thread if it was sleeping. Returns previous + // state.
+ state_t input_wake(); + }; + + /****/ + + /* Allocates nb_buckets buckets, puts all of them in the write queue and starts the input thread. Raises Error if the thread cannot be created. */ + template + double_fifo_input::double_fifo_input(unsigned long _nb_buckets) : + rq(_nb_buckets), wq(_nb_buckets), nb_buckets(_nb_buckets), state(WORKING), + input_id(0) + { + buckets = new T[nb_buckets]; + + for(unsigned long i = 0; i < nb_buckets; ++i) + wq.enqueue(&buckets[i]); + + if(pthread_create(&input_id, 0, static_input_routine, (void *)this) != 0) + eraise(Error) << "Failed creating input thread" << err::no; + } + + /* Stops the input thread before freeing the buckets it works on. pthread_cancel() returns 0 on success, so the join must happen in that case: the thread has to be gone before delete[] runs. */ + template + double_fifo_input::~double_fifo_input() { + if(input_id) + if(pthread_cancel(input_id) == 0) { + void *input_return; + pthread_join(input_id, &input_return); + } + delete [] buckets; + } + + /* pthread entry point: forwards to input_routine() on the object passed as arg. */ + template + void *double_fifo_input::static_input_routine(void *arg) { + double_fifo_input *o = (double_fifo_input *)arg; + o->input_routine(); + return 0; + } + + /* Body of the input thread: until the read queue is closed, go to sleep on full_queue, wait to be moved to WAKENING by input_wake(), then call fill(). */ + template + void double_fifo_input::input_routine() { + state_t prev_state; + + while(!rq.is_closed()) { + // The write queue is full or this is the first iteration, sleep + // until it become less than some threshold + full_queue.lock(); + prev_state = atomic::gcc::cas(&state, WORKING, SLEEPING); + assert(prev_state == WORKING); + do { + full_queue.wait(); + } while(state != WAKENING); + prev_state = atomic::gcc::cas(&state, WAKENING, WORKING); + assert(prev_state == WAKENING); + full_queue.unlock(); + + fill(); + } + } + + /* Moves the state from SLEEPING to WAKENING and signals the input thread; does nothing in the other states. */ + template + typename double_fifo_input::state_t double_fifo_input::input_wake() { + state_t prev_state = atomic::gcc::cas(&state, SLEEPING, WAKENING); + assert(prev_state >= WORKING && prev_state <= WAKENING); + if(prev_state == SLEEPING) { + full_queue.lock(); + full_queue.signal(); + full_queue.unlock(); + } + return prev_state; + } + + /* Returns the next filled bucket, or 0 once the read queue is closed and empty. While the queue is empty it wakes the input thread and polls again after a fixed 10 ms pause. */ + template + T *double_fifo_input::next() { + if(rq.is_low()) // && !rq.is_closed()) + input_wake(); + + T *res = 0; + while(!(res = rq.dequeue())) { + if(rq.is_closed()) + return 0; + input_wake(); + // TODO Should we wait on a lock instead when the input thread is + // already in working state
(i.e. it is most likely blocked on + // some I/O). + static struct timespec time_sleep = { 0, 10000000 }; + nanosleep(&time_sleep, NULL); + } + + return res; + } + + /* Hands a consumed bucket back to the write queue; bucket must belong to this object's bucket array. */ + template + void double_fifo_input::release(T *bucket) { + assert(bucket - buckets >= 0 && (unsigned long)(bucket - buckets) < nb_buckets); + wq.enqueue(bucket); + } + + /* Next bucket available for filling, as returned by the write queue's dequeue(). */ + template + T *double_fifo_input::write_next() { + return wq.dequeue(); + } + + /* Publishes a filled bucket to the read queue. */ + template + void double_fifo_input::write_release(T *bucket) { + assert(bucket - buckets >= 0 && (unsigned long)(bucket - buckets) < nb_buckets); + rq.enqueue(bucket); + } +} + +#endif diff --git a/src/inc/jellyfish/dumper.hpp b/src/inc/jellyfish/dumper.hpp new file mode 100644 index 00000000..47b80980 --- /dev/null +++ b/src/inc/jellyfish/dumper.hpp @@ -0,0 +1,74 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . +*/ + +#ifndef __JELLYFISH_DUMPER_HPP__ +#define __JELLYFISH_DUMPER_HPP__ + +#include +#include +#include +#include + +/** + * A dumper is responsible to dump the hash array to permanent storage + * and zero out the array.
+ **/ +namespace jellyfish { + /* Base class for dumpers: subclasses implement _dump(); dump() wraps it and accumulates the time spent. */ + class dumper_t { + Time writing_time; + public: + define_error_class(ErrorWriting); + + protected: + /* Opens out on "<prefix>_<n>", where n is the value returned by an atomic fetch_add of 1 on *index. Raises ErrorWriting when the path cannot be formatted, does not fit in the path length reported by pathconf("/", _PC_PATH_MAX), or cannot be opened. */ + void open_next_file(const char *prefix, int *index, std::ofstream &out) { + static const long file_len = pathconf("/", _PC_PATH_MAX); + + char file[file_len + 1]; + file[file_len] = '\0'; + int off = snprintf(file, file_len, "%s", prefix); + if(off < 0) + eraise(ErrorWriting) << "Error creating output path" << err::no; + if(off > 0 && off < file_len) { + int eindex = atomic::gcc::fetch_add(index, (int)1); + int _off = snprintf(file + off, file_len - off, "_%d", eindex); + if(_off < 0) + eraise(ErrorWriting) << "Error creating output path" << err::no; + off += _off; + } + if(off >= file_len) + eraise(ErrorWriting) << "Output path is longer than maximum path length (" + << off << " > " << file_len << ")"; + + out.open(file); + if(out.fail()) + eraise(ErrorWriting) << "'" << (char*)file << "': " + << "Can't open file for writing" << err::no; + } + + public: + dumper_t() : writing_time(::Time::zero) {} + /* Runs _dump() and adds end - start to writing_time (presumably a default-constructed Time is the current time -- confirm against the Time class). */ + void dump() { + Time start; + _dump(); + Time end; + writing_time += end - start; + } + virtual void _dump() = 0; + Time get_writing_time() const { return writing_time; } + virtual ~dumper_t() {}; + }; +} +#endif // __DUMPER_HPP__ diff --git a/src/inc/jellyfish/err.hpp b/src/inc/jellyfish/err.hpp new file mode 100644 index 00000000..50e4bf99 --- /dev/null +++ b/src/inc/jellyfish/err.hpp @@ -0,0 +1,120 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details.
+ + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . +*/ + +#ifndef __ERR_HPP__ +#define __ERR_HPP__ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace err { + class code { + int _code; + public: + explicit code(int c) : _code(c) {} + int get_code() const { return _code; } + }; + + class no_t { + public: + no_t() {} + static void write(std::ostream &os, int e) { + char err_str[4096]; + strerror_r(e, err_str, sizeof(err_str)); + os << ": " << err_str; + } + }; + static const no_t no; + std::ostream &operator<<(std::ostream &os, const err::no_t &x); + + class substr { + const char *_s; + const size_t _l; + public: + substr(const char *s, size_t len) : _s(s), _l(len) {} + friend std::ostream &operator<<(std::ostream &os, const substr &ss); + }; + + class die_t { + int _code; + int _errno; + public: + die_t() : _code(1), _errno(errno) {} + explicit die_t(int c) : _code(c), _errno(errno) {} + ~die_t() { + std::cerr << std::endl; + exit(_code); + } + + die_t & operator<<(const code &x) { + _code = x.get_code(); + return *this; + } + die_t & operator<<(const no_t &x) { + x.write(std::cerr, _errno); + return *this; + } + die_t & operator<<(const char *a[]) { + for(int i = 0; a[i]; i++) + std::cerr << (i ? 
"\n" : "") << a[i]; + return *this; + } + die_t & operator<<(const std::exception &e) { + std::cerr << e.what(); + return *this; + } + template + die_t & operator<<(const T &x) { + std::cerr << x; + return *this; + } + }; + + template + class raise_t { + int _errno; + std::ostringstream oss; + public: + raise_t() : _errno(errno) {} + ~raise_t() { throw err_t(oss.str()); } + + raise_t & operator<<(const no_t &x) { + x.write(oss, _errno); + return *this; + } + template + raise_t & operator<<(const T &x) { + oss << x; + return *this; + } + }; +} + + +#define die if(1) err::die_t() +#define eraise(e) if(1) err::raise_t() +#define define_error_class(name) \ + class name : public std::runtime_error { \ + public: explicit name(const std::string &txt) : std::runtime_error(txt) {} \ + } + +#endif diff --git a/src/inc/jellyfish/fastq_dumper.hpp b/src/inc/jellyfish/fastq_dumper.hpp new file mode 100644 index 00000000..1613f172 --- /dev/null +++ b/src/inc/jellyfish/fastq_dumper.hpp @@ -0,0 +1,131 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . 
+*/ + +#ifndef __FASTQ_DUMPER_HPP__ +#define __FASTQ_DUMPER_HPP__ + +#include +#include +#include +#include + +namespace jellyfish { + namespace fastq_hash { + define_error_class(ErrorReading); + static const char *file_type = "JFFSTQDN"; + /* Fixed-size header at the start of a dump file: an 8-byte type tag followed by the table parameters and the byte offset of the values array. */ + struct header { + char type[8]; + uint64_t key_len; + uint64_t size; + uint64_t max_reprobes; + uint64_t values_pos; + + header(uint64_t _key_len, uint64_t _size, uint64_t _max_reprobes, + uint64_t _values_pos) : + key_len(_key_len), size(_size), max_reprobes(_max_reprobes), + values_pos(_values_pos) + { + memcpy(&type, file_type, sizeof(type)); + } + + /* Checks the type tag at ptr, raising ErrorReading on mismatch, then copies the raw header bytes from ptr into this object. */ + explicit header(const char *ptr) { + if(memcmp(ptr, file_type, sizeof(type))) + eraise(ErrorReading) << "Bad file type '" << err::substr(ptr, sizeof(type)) + << "', expected '" << err::substr(file_type, sizeof(type)) << "'"; + memcpy((void *)this, ptr, sizeof(struct header)); + } + }; + + /* Dumper writing the storage as is (matrices, keys, values) to numbered files, and reader rebuilding a storage_t on top of such a file. The chunk_size constructor argument is not used. */ + template + class raw_dumper : public dumper_t { + const uint_t threads; + const std::string file_prefix; + storage_t *const ary; + int file_index; + + public: + raw_dumper(uint_t _threads, const char *_file_prefix, size_t chunk_size, + storage_t *_ary) : + threads(_threads), file_prefix(_file_prefix), + ary(_ary), file_index(0) {} + + virtual void _dump(); + + static storage_t * read(const mapped_file &file); + static storage_t * read(const std::string &file); + static storage_t * read(const char *file); + }; + + /* File layout: header | matrices | key blocks | values. The header is written last, after seeking back to offset 0, because values_pos is only known once the keys have been written. Keys and values are zeroed in the array right after being written. */ + template + void raw_dumper::_dump() { + std::ofstream _out; + open_next_file(file_prefix.c_str(), &file_index, _out); + + // TODO: the zeroing out of the hash is not parallelized.
+ + // Skip header + _out.seekp(sizeof(struct header)); + // Write matrices + ary->write_matrices(&_out); + // Write key set + ary->write_keys_blocks(&_out, 0, ary->get_size()); + std::streampos pos = _out.tellp(); + ary->zero_keys(0, ary->get_size()); + // Write values array + ary->write_values(&_out, 0, ary->get_size()); + ary->zero_values(0, ary->get_size()); + // Update header + _out.seekp(0); + struct header header(ary->get_key_len(), ary->get_size(), + ary->get_max_reprobe(), pos); + _out.write((char *)&header, sizeof(header)); + _out.close(); + } + + /* The two path overloads map the file and delegate to read(const mapped_file &). NOTE(review): the mapped_file is a local here while the returned storage points into its memory -- confirm that mapped_file does not unmap on destruction. */ + template + storage_t * raw_dumper::read(const std::string &file) { + mapped_file mf(file.c_str()); + return read(mf); + } + + template + storage_t * raw_dumper::read(const char *file) { + mapped_file mf(file); + return read(mf); + } + + /* Validates the header, reads the two matrices that follow it, and builds a storage_t whose keys start right after the matrices and whose values start at header.values_pos. The caller owns the returned object. */ + template + storage_t * raw_dumper::read(const mapped_file &mf) { + if(mf.length() < sizeof(struct header)) + eraise(ErrorReading) << "File '" << mf.path() + << "' too short. Should be at least '" + << sizeof(struct header) << "' bytes"; + + struct header header(mf.base()); + size_t off = sizeof(header); + SquareBinaryMatrix hash_matrix, hash_inv_matrix; + off += hash_matrix.read(mf.base() + off); + off += hash_inv_matrix.read(mf.base() + off); + return new storage_t(mf.base() + off, + mf.base() + header.values_pos, + header.size, header.key_len, header.max_reprobes, + jellyfish::quadratic_reprobes, hash_matrix, + hash_inv_matrix); + } + } +} + +#endif diff --git a/src/inc/jellyfish/file_parser.hpp b/src/inc/jellyfish/file_parser.hpp new file mode 100644 index 00000000..a6cdd37c --- /dev/null +++ b/src/inc/jellyfish/file_parser.hpp @@ -0,0 +1,92 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version.
+ + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . +*/ + +#ifndef __JELLYFISH_FILE_PARSER_HPP__ +#define __JELLYFISH_FILE_PARSER_HPP__ + +#include +#include +#include + +#include +#include +#include + +namespace jellyfish { + class file_parser { + int _fd; + int _base, _pbase; + char *_buffer; + const char *_end_buffer; + const char *_data; + const char *_end_data; + size_t _size; + bool _is_mmapped; + + static bool _do_mmap; + static bool _force_mmap; + + protected: + define_error_class(FileParserError); + // Get next character in "stream" + inline int sbumpc() { + _pbase = _base; + if(_data >= _end_data) + if(!read_next_buffer()) + return _base = _eof; + return (_base = *_data++); + } + int speekc() { + if(_data >= _end_data) + if(!read_next_buffer()) + return _eof; + return *_data; + } + + + public: + static const size_t _buff_size = 1024 * 1024; + static const int _eof = -1; + + // [str, str+len) is content on initial buffer + file_parser(int fd, const char *path, const char *str, size_t len, + char pbase = '\n'); + ~file_parser(); + + static bool do_mmap() { return _do_mmap; }; + static bool do_mmap(bool new_value) { bool oval = _do_mmap; _do_mmap = new_value; return oval; } + // throw an error if mmap fails + static bool force_mmap(); + static bool force_mmap(bool new_value); + static int file_peek(const char *path, char *peek); + + // current base and previous base + int base() const { return _base; } + int pbase() const { return _pbase; } + bool eof() const { return _base == _eof; } + + // ptr to base. 
Valid only for mmaped files + const char *ptr() const { return _data; } + const char *base_ptr() const { return _data - 1; } + const char *pbase_ptr() const { return _data; } + + private: + // Buffers next chunk of data. Returns _eof if at end of file or next character + bool read_next_buffer(); + }; +} + +#endif diff --git a/src/inc/jellyfish/floats.hpp b/src/inc/jellyfish/floats.hpp new file mode 100644 index 00000000..06042c27 --- /dev/null +++ b/src/inc/jellyfish/floats.hpp @@ -0,0 +1,71 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . 
+*/ + +#ifndef __JELLYFISH_FLOATS_HPP__ +#define __JELLYFISH_FLOATS_HPP__ + +#include +#include +#ifdef HALF_FLOATS +#include +#endif + +namespace jellyfish { + class Float { + public: +#ifdef HALF_FLOATS + typedef uint16_t bits_t; + typedef half float_t; +#else + typedef uint32_t bits_t; + typedef float float_t; +#endif + + private: + union float_int { + float_t fv; + bits_t iv; + explicit float_int(float_t v) : fv(v) {} + explicit float_int(bits_t v) : iv(v) {} + }; + float_int v; + + public: + Float() : v(0.0f) {} + explicit Float(int _v) : v((bits_t)_v) {} + explicit Float(float_t _v) : v(_v) {} + explicit Float(bits_t _v) : v(_v) {} + + // static const Float zero; + // static const Float one; + + const Float operator+(const Float &y) const { + return Float(v.fv + y.v.fv); + } + const Float operator+(const float& y) const { + return Float(v.fv + y); + } + + bits_t bits() const { return v.iv; }; + float_t to_float() const { return v.fv; }; + + // Should we use the floating point ==? + bool operator==(Float o) { return v.iv == o.v.iv; } + friend std::ostream &operator<<(std::ostream &os, const Float &f); + }; +} + +#endif diff --git a/src/inc/jellyfish/fstream_default.hpp b/src/inc/jellyfish/fstream_default.hpp new file mode 100644 index 00000000..0ac298ea --- /dev/null +++ b/src/inc/jellyfish/fstream_default.hpp @@ -0,0 +1,68 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . 
+*/ + +#ifndef __JELLYFISH_FSTREAM_WITH_DEFAULT_HPP__ +#define __JELLYFISH_FSTREAM_WITH_DEFAULT_HPP__ + +#include +#include + +template +class fstream_default : public Base { + typedef Base super; + static std::streambuf* open_file(const char* str, std::ios_base::openmode mode) { + std::filebuf* fb = new std::filebuf; + return fb->open(str, mode); + } + + static std::streambuf* get_streambuf(const char* str, Base& def, + std::ios_base::openmode mode) { + return (str != 0) ? open_file(str, mode) : def.rdbuf(); + } + static std::streambuf* get_streambuf(const char* str, std::streambuf* buf, + std::ios_base::openmode mode) { + return (str != 0) ? open_file(str, mode) : buf; + } + + bool do_close; +public: + fstream_default(const char* str, Base& def, std::ios_base::openmode mode = def_mode) : + Base(get_streambuf(str, def, mode)), do_close(str != 0) { + if(Base::rdbuf() == 0) + Base::setstate(std::ios_base::badbit); + } + fstream_default(const char* str, std::streambuf* def, std::ios_base::openmode mode = def_mode) : + Base(get_streambuf(str, def, mode)), do_close(str != 0) { + if(Base::rdbuf() == 0) + Base::setstate(std::ios_base::badbit); + } + + ~fstream_default() { + if(do_close) { + delete Base::rdbuf(0); + do_close = false; + } + } + // Close is a noop at this point as GCC 4.4 has a problem with + // Base::rdbuf in methods (breaks strict aliasing). Beats me! I + // think it is a false positive. + void close() {} +}; + +typedef fstream_default ofstream_default; +typedef fstream_default ifstream_default; + +#endif // __JELLYFISH_FSTREAM_WITH_DEFAULT_HPP__ diff --git a/src/inc/jellyfish/hash.hpp b/src/inc/jellyfish/hash.hpp new file mode 100644 index 00000000..6daeb7aa --- /dev/null +++ b/src/inc/jellyfish/hash.hpp @@ -0,0 +1,253 @@ +/* This file is part of Jellyfish. 
+
+    Jellyfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Jellyfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Jellyfish. If not, see .
+*/
+
+#ifndef __JELLYFISH_HASH_HPP__
+#define __JELLYFISH_HASH_HPP__
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace jellyfish {
+  /* Wrapper around a "storage". The hash class manages threads. In
+     particular, it synchronizes the threads for the size-doubling
+     operation and manages dumping the hash to disk. The storage class
+     is responsible for the details of storing the key,value pairs,
+     memory management, reprobing, etc.
+   */
+
+  // Non-template base class; it only provides a virtual destructor.
+  class hash_t {
+  public:
+    virtual ~hash_t() {}
+  };
+
+  // NOTE(review): the template parameter lists in this copy are bare
+  // `template` keywords; the body below uses key_t, val_t, ary_t and
+  // atomic as if they were parameters. Confirm against the original
+  // header.
+  template
+  class hash : public hash_t {
+  public:
+    define_error_class(TableFull);
+    // typedef typename std::pair kv_t;
+    typedef ary_t storage_t;
+    typedef typename ary_t::iterator iterator;
+
+    // Neither constructor initializes inuse_thread_count; it is
+    // assigned in get_event_locks() before being tested there.
+    hash() : ary(NULL), dumper(NULL), dumping_initiated(false) {}
+    explicit hash(ary_t *_ary) : ary(_ary), dumper(NULL), dumping_initiated(false) {}
+
+    virtual ~hash() {}
+
+    // The accessors below forward to the storage array. They
+    // dereference ary without a check, so ary must not be NULL.
+    size_t get_size() const { return ary->get_size(); }
+    uint_t get_key_len() const { return ary->get_key_len(); }
+    uint_t get_val_len() const { return ary->get_val_len(); }
+    uint_t get_max_reprobe() const { return ary->get_max_reprobe(); }
+    size_t get_max_reprobe_offset() const { return ary->get_max_reprobe_offset(); }
+
+    // Install the dumper used by dump(). The pointer is stored as is.
+    void set_dumper(dumper_t *new_dumper) { dumper = new_dumper; }
+    // Writing time reported by the dumper, or Time::zero when no
+    // dumper is set.
+    Time get_writing_time() const {
+      if(!dumper)
+        return Time::zero;
+      return dumper->get_writing_time();
+    }
+
+    void write_raw(std::ostream &out) { ary->write_raw(out); }
+
+    iterator iterator_all() const { return ary->iterator_all(); }
+    iterator iterator_slice(size_t slice_number, size_t number_of_slice) const {
+      return ary->iterator_slice(slice_number, number_of_slice);
+    }
+
+    /*
+     * Thread handle to the hash.
+     */
+    enum status_t { FREE, INUSE, BLOCKED };
+    class thread {
+      ary_t *ary;
+      size_t hsize_mask;
+      status_t status;   // FREE, INUSE while inside add(), BLOCKED during an event
+      status_t ostatus;  // status seen by get_event_locks() when it blocked this handle
+      hash *my_hash;
+
+    public:
+      typedef val_t val_type;
+      // ostatus is not initialized here; get_event_locks() assigns it
+      // before reading it.
+      thread(ary_t *_ary, hash *_my_hash) :
+        ary(_ary), hsize_mask(ary->get_size() - 1), status(FREE), my_hash(_my_hash)
+      { }
+
+      // Add val to the value associated with key. Returns the old
+      // value in *oval if oval is not NULL
+      //
+      // Each attempt first switches status from FREE to INUSE,
+      // waiting in wait_event_is_done() for as long as that fails. If
+      // the storage refuses the addition, the handle tries to become
+      // the event handler: on success it dumps the hash and releases
+      // the event locks. In both cases the addition is then retried.
+      template
+      inline void add(key_t key, const add_t &val, val_t *oval = 0) {
+        while(true) {
+          while(atomic::cas(&status, FREE, INUSE) != FREE)
+            my_hash->wait_event_is_done();
+
+          if(ary->add(key, val, oval))
+            break;
+
+          // Improve this. Undefined behavior if dump_to_file throws an error.
+          if(my_hash->get_event_locks()) {
+            my_hash->dump();
+            my_hash->release_event_locks();
+          }
+        }
+
+        // If status is no longer INUSE, an event was started while
+        // this handle was working: report that it is done.
+        if(atomic::cas(&status, INUSE, FREE) != INUSE)
+          my_hash->signal_not_in_use();
+      }
+
+      // void inc(key_t key, val_t *oval = 0) { return this->add(key, (add_t)1, oval); }
+      // inline void operator()(key_t key) { return this->add(key, (val_t)1); }
+
+      friend class hash;
+    };
+    friend class thread;
+    typedef std::list thread_list_t;
+    // Iterator into the list of thread handles, with val_type added.
+    class thread_ptr_t : public thread_list_t::iterator {
+    public:
+      explicit thread_ptr_t(const typename thread_list_t::iterator &thl) : thread_list_t::iterator(thl) {}
+      typedef val_t val_type;
+    };
+    // typedef typename thread_list_t::iterator thread_ptr_t;
+
+    // Register a new thread handle at the front of the list, under
+    // user_thread_lock, and return an iterator to it.
+    thread_ptr_t new_thread() {
+      user_thread_lock.lock();
+      thread_ptr_t res(user_thread_list.insert(user_thread_list.begin(), thread(ary, this)));
+      user_thread_lock.unlock();
+      return res;
+    }
+
+    // Remove a handle obtained from new_thread(), under
+    // user_thread_lock.
+    void release_thread(thread_ptr_t &th) {
+      user_thread_lock.lock();
+      user_thread_list.erase(th);
+      user_thread_lock.unlock();
+    }
+
+    // Dump through the configured dumper. Raises TableFull when no
+    // dumper has been set.
+    void dump() {
+      if(dumper)
+        dumper->dump();
+      else
+        eraise(TableFull) << "No dumper defined";
+    }
+
+  private:
+    /**
+     * The following methods are called by threads to manage
+     * administrative events: size doubling or dumping the hash to
+     * disk.
+     **/
+
+    /**
+     * Called by a thread if it failed to switch its states from INUSE
+     * to FREE. It lets the thread triggering the event know that the
+     * hash is free. This method returns after the signaling and does
+     * not wait for the handling of the event to be over.
+     *
+     * With take_inuse_lock == false the caller must already hold
+     * inuse_thread_cond; the lock is released on return in both
+     * cases.
+     **/
+    void signal_not_in_use(bool take_inuse_lock = true) {
+      if(take_inuse_lock)
+        inuse_thread_cond.lock();
+      if(--inuse_thread_count == 0)
+        inuse_thread_cond.signal();
+      inuse_thread_cond.unlock();
+    }
+
+    /**
+     * Called by a thread if it failed to switch its states from FREE
+     * to INUSE. An event management has been initiated. This call
+     * waits for the event handling to be over.
+     *
+     * With take_event_lock == false the caller must already hold
+     * event_cond; the lock is released on return in both cases.
+     **/
+    void wait_event_is_done(bool take_event_lock = true) {
+      if(take_event_lock)
+        event_cond.lock();
+      while(dumping_initiated)
+        event_cond.wait();
+      event_cond.unlock();
+    }
+
+    /**
+     * Get the locks before handling an event and returns true if
+     * success. It guarantees that no thread is doing an operation on
+     * the hash. If another thread is already handling an event,
+     * degrade to wait_event_is_done and returns false.
+     *
+     * On a true return user_thread_lock is still held; it is
+     * released by release_event_locks().
+     **/
+    bool get_event_locks() {
+      inuse_thread_cond.lock();
+      event_cond.lock();
+      if(dumping_initiated) {
+        // Another thread is doing the dumping
+        // Both calls are made with `false` because the two locks are
+        // already held; each call releases its own lock.
+        signal_not_in_use(false);
+        wait_event_is_done(false);
+        return false;
+      }
+
+      // I am the thread doing the dumping
+      user_thread_lock.lock();
+      dumping_initiated = true;
+      event_cond.unlock();
+
+      inuse_thread_count = 0;
+
+      // Block access to hash and wait for threads with INUSE state
+      for(thread_ptr_t it(user_thread_list.begin());
+          it != user_thread_list.end();
+          it++) {
+        it->ostatus = atomic::set(&it->status, BLOCKED);
+        if(it->ostatus == INUSE)
+          inuse_thread_count++;
+      }
+      inuse_thread_count--; // Remove 1 for myself!
+      while(inuse_thread_count > 0) {
+        inuse_thread_cond.wait();
+      }
+      inuse_thread_cond.unlock();
+
+      return true;
+    }
+
+    // End of event handling: set every handle back to FREE, release
+    // user_thread_lock (taken in get_event_locks()), clear
+    // dumping_initiated and wake up all threads waiting in
+    // wait_event_is_done().
+    void release_event_locks() {
+      event_cond.lock();
+      for(thread_ptr_t it(user_thread_list.begin());
+          it != user_thread_list.end();
+          it++) {
+        atomic::set(&it->status, FREE);
+      }
+      user_thread_lock.unlock();
+      dumping_initiated = false;
+      event_cond.broadcast();
+      event_cond.unlock();
+    }
+
+  private:
+    ary_t *ary;                               // storage array; not deleted by this class
+    dumper_t *dumper;                         // may be NULL, see dump()
+    volatile bool dumping_initiated;          // true while a thread handles an event
+    thread_list_t user_thread_list;           // registered thread handles
+    locks::pthread::mutex user_thread_lock;   // protects user_thread_list
+    locks::pthread::cond event_cond;          // waited on until dumping_initiated is cleared
+    locks::pthread::cond inuse_thread_cond;   // signaled when inuse_thread_count reaches 0
+    volatile uint_t inuse_thread_count;       // handles found INUSE that have not reported yet
+  };
+}
+
+#endif // __HASH_HPP__
diff --git a/src/inc/jellyfish/heap.hpp b/src/inc/jellyfish/heap.hpp
new file mode 100644
index 00000000..cae37f1c
--- /dev/null
+++ b/src/inc/jellyfish/heap.hpp
@@ -0,0 +1,125 @@
+/* This file is part of Jellyfish.
+
+    Jellyfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Jellyfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Jellyfish. If not, see .
+*/ + +#ifndef __JELLYFISH_HEAP_HPP__ +#define __JELLYFISH_HEAP_HPP__ + +#include + +namespace jellyfish { + template + struct heap_item_t { + uint64_t key; + uint64_t val; + uint64_t pos; + iterator *it; + + heap_item_t() : key(0), val(0), pos(0) { } + + explicit heap_item_t(iterator &iter) { + initialize(iter); + } + + void initialize(iterator &iter) { + key = iter.key; + val = iter.val; + pos = iter.get_pos(); + it = &iter; + } + + // STL make_heap creates a max heap. We want a min heap, so + // reverse comparator! + bool operator<(const heap_item_t & other) { + if(pos == other.pos) + return key > other.key; + return pos > other.pos; + } + }; + + template + class heap_item_compare { + public: + inline bool operator() (heap_item_t *i1, heap_item_t *i2) { + return *i1 < *i2; + } + }; + + template + class heap_t { + heap_item_t *storage; + heap_item_t **elts; + size_t capacity_; + size_t h; + heap_item_compare compare; + public: + typedef const heap_item_t *const_item_t; + + heap_t() : storage(0), elts(0), capacity_(0), h(0) { } + explicit heap_t(size_t _capacity) { initialize(_capacity); } + ~heap_t() { + delete[] storage; + delete[] elts; + } + + void initialize(size_t _capacity) { + capacity_ = _capacity; + h = 0; + storage = new heap_item_t[capacity_]; + elts = new heap_item_t*[capacity_]; + for(size_t h1 = 0; h1 < capacity_; ++h1) + elts[h1] = &storage[h1]; + } + + void fill(iterator &it) { + h = 0; + while(h < capacity_) { + if(!it.next()) + break; + storage[h].initialize(it); + elts[h] = &storage[h]; + h++; + } + std::make_heap(elts, elts + h, compare); + } + template + void fill(ForwardIterator first, ForwardIterator last) { + h = 0; + while(h < capacity_ && first != last) { + if(!first->next()) + break; + storage[h].initialize(*first++); + elts[h] = &storage[h]; + h++; + } + std::make_heap(elts, elts + h, compare); + } + + bool is_empty() const { return h == 0; } + bool is_not_empty() const { return h > 0; } + size_t size() const { return h; } + size_t 
capacity() const { return capacity_; } + + // The following 3 should only be used after fill has been called + const_item_t head() const { return elts[0]; } + void pop() { std::pop_heap(elts, elts + h--, compare); } + void push(iterator &item) { + elts[h]->initialize(item); + std::push_heap(elts, elts + ++h, compare); + } + }; +} + +#endif // __HEAP_HPP__ diff --git a/src/inc/jellyfish/invertible_hash_array.hpp b/src/inc/jellyfish/invertible_hash_array.hpp new file mode 100644 index 00000000..f86f3f45 --- /dev/null +++ b/src/inc/jellyfish/invertible_hash_array.hpp @@ -0,0 +1,1015 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . +*/ + +#ifndef __JELLYFISH_REVERSIBLE_HASH__ +#define __JELLYFISH_REVERSIBLE_HASH__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace jellyfish { + namespace invertible_hash { + define_error_class(InvalidMap); + define_error_class(ErrorAllocation); + define_error_class(InvalidMatrix); + + /* Contains an integer, the reprobe limit. It is capped based on + * the reprobe strategy to not be bigger than the size of the hash + * array. 
+ */ + class reprobe_limit_t { + uint_t limit; + public: + reprobe_limit_t(uint_t _limit, size_t *_reprobes, size_t _size) : + limit(_limit) + { + while(_reprobes[limit] >= _size && limit >= 1) + limit--; + } + inline uint_t val() const { return limit; } + }; + + /* (key,value) pair bit-packed array. It implements the logic of the + * packed hash except for size doubling. Setting or incrementing a key + * will return false if the hash is full. No memory management is done + * in this class either. + * + * The hash function is assumed to be invertible. The key is not + * directly stored in the hash. Let h = hash(key), size_table = + * 2**k and key_len = h+k. In the key field is written + * + * h high bits of h, reprobe value + 1 + * + * The +1 for the reprobe value is so that the field is guaranteed + * not to be zero. + */ + template + class array : public storage_t { + typedef typename Offsets::offset_t offset_t; + uint_t lsize; // log of size + size_t size, size_mask; + reprobe_limit_t reprobe_limit; + uint_t key_len; // original key len + word key_mask; // mask for high bits of hash(key) + uint_t key_off; // offset in key field for reprobe value + Offsets offsets; // key len reduced by size of hash array + mem_block_t mem_block; + word *data; + atomic_t atomic; + size_t *reprobes; + SquareBinaryMatrix hash_matrix; + SquareBinaryMatrix hash_inverse_matrix; + struct header { + uint64_t klen; + uint64_t clen; + uint64_t size; + uint64_t reprobe_limit; + }; + + public: + typedef word key_t; + typedef word val_t; + + array(size_t _size, uint_t _key_len, uint_t _val_len, + uint_t _reprobe_limit, size_t *_reprobes) : + lsize(ceilLog2(_size)), size(((size_t)1) << lsize), + size_mask(size - 1), + reprobe_limit(_reprobe_limit, _reprobes, size), key_len(_key_len), + key_mask(key_len <= lsize ? 0 : (((word)1) << (key_len - lsize)) - 1), + key_off(key_len <= lsize ? 
0 : key_len - lsize), + offsets(key_off + bitsize(reprobe_limit.val() + 1), _val_len, + reprobe_limit.val() + 1), + mem_block(div_ceil(size, (size_t)offsets.get_block_len()) * offsets.get_block_word_len() * sizeof(word)), + data((word *)mem_block.get_ptr()), reprobes(_reprobes), + hash_matrix(key_len), + hash_inverse_matrix(hash_matrix.init_random_inverse()) + { + if(!data) + eraise(ErrorAllocation) << "Failed to allocate " + << (div_ceil(size, (size_t)offsets.get_block_len()) * offsets.get_block_word_len() * sizeof(word)) + << " bytes of memory"; + } + + // TODO: This parsing should be done in another class and use + // the following constructor. + // array(char *map, size_t length) : + // hash_matrix(0), hash_inverse_matrix(0) { + // if(length < sizeof(struct header)) + // eraise(InvalidMap) << "File truncated"; + // struct header *header = (struct header *)map; + // size = header->size; + // if(size != (1UL << floorLog2(size))) + // eraise(InvalidMap) << "Size '" << size << "' is not a power of 2"; + // lsize = ceilLog2(size); + // size_mask = size - 1; + // reprobe_limit = header->reprobe_limit; + // key_len = header->klen; + // if(key_len > 64 || key_len == 0) + // eraise(InvalidMap) << "Invalid key length '" << key_len << "'"; + // offsets.init(key_len + bitsize(reprobe_limit + 1) - lsize, header->clen, + // reprobe_limit); + // key_mask = (((word)1) << (key_len - lsize)) - 1; + // key_off = key_len - lsize; + // map += sizeof(struct header); + // // reprobes = new size_t[header->reprobe_limit + 1]; + // // TODO: should that be in the database file? 
+ // reprobes = jellyfish::quadratic_reprobes; + // // memcpy(reprobes, map, sizeof(size_t) * (header->reprobe_limit + 1)); + // // map += sizeof(size_t) * (header->reprobe_limit + 1); + // map += hash_matrix.read(map); + // if((uint_t)hash_matrix.get_size() != key_len) + // eraise(InvalidMatrix) << "Size of hash matrix '" << hash_matrix.get_size() + // << "' not equal to key length '" << key_len << "'"; + // map += hash_inverse_matrix.read(map); + // if((uint_t)hash_inverse_matrix.get_size() != key_len) + // eraise(InvalidMatrix) << "Size of inverse hash matrix '" << hash_inverse_matrix.get_size() + // << "' not equal to key length '" << key_len << "'"; + // if((size_t)map & 0x7) + // map += 0x8 - ((size_t)map & 0x7); // Make sure aligned for 64bits word. TODO: use alignof? + // data = (word *)map; + // } + + // Assume _size is already a power of 2 + // map must point to a memory area written by "write_blocks". No header + array(char *map, size_t _size, uint_t _key_len, uint_t _val_len, + uint_t _reprobe_limit, size_t *_reprobes, + SquareBinaryMatrix &_hash_matrix, + SquareBinaryMatrix &_hash_inverse_matrix) : + lsize(ceilLog2(_size)), size(_size), size_mask(size-1), + reprobe_limit(_reprobe_limit, _reprobes, size), key_len(_key_len), + key_mask(key_len <= lsize ? 0 : (((word)1) << (key_len - lsize)) - 1), + key_off(key_len <= lsize ? 
0 : key_len - lsize), + offsets(key_off + bitsize(reprobe_limit.val() + 1), _val_len, + reprobe_limit.val() + 1), + data((word *)map), reprobes(_reprobes), + hash_matrix(_hash_matrix), hash_inverse_matrix(_hash_inverse_matrix) + { } + + ~array() { } + + // Lock in memory + int lock() { + return mem_block.lock(); + } + + void set_matrix(SquareBinaryMatrix &m) { + if((uint_t)m.get_size() != key_len) + eraise(InvalidMatrix) << "Size of matrix '" << m.get_size() + << "' not equal to key length '" << key_len << "'"; + hash_matrix = m; + hash_inverse_matrix = m.inverse(); + } + + size_t get_size() const { return size; } + size_t get_lsize() const { return lsize; } + uint_t get_key_len() const { return key_len; } + uint_t get_val_len() const { return offsets.get_val_len(); } + + uint_t get_max_reprobe() const { return reprobe_limit.val(); } + size_t get_max_reprobe_offset() const { + return reprobes[reprobe_limit.val()]; + } + + + uint_t get_block_len() const { return offsets.get_block_len(); } + uint_t get_block_word_len() const { return offsets.get_block_word_len(); } + size_t floor_block(size_t entries, size_t &blocks) const { + return offsets.floor_block(entries, blocks); + } + + private: + void block_to_ptr(const size_t start, const size_t blen, + char **start_ptr, size_t *memlen) const { + *start_ptr = (char *)(data + start * offsets.get_block_word_len()); + char *end_ptr = (char *)mem_block.get_ptr() + mem_block.get_size(); + + if(*start_ptr >= end_ptr) { + *memlen = 0; + return; + } + *memlen = blen * offsets.get_block_word_len() * sizeof(word); + if(*start_ptr + *memlen > end_ptr) + *memlen = end_ptr - *start_ptr; + } + + public: + /** + * Zero out blocks in [start, start+length), where start and + * length are given in number of blocks. 
+ **/ + void zero_blocks(const size_t start, const size_t length) { + char *start_ptr; + size_t memlen; + block_to_ptr(start, length, &start_ptr, &memlen); + memset(start_ptr, '\0', memlen); + } + + /** + * Write to out blocks [start, start+length). + */ + void write_blocks(std::ostream *out, const size_t start, const size_t length) const { + char *start_ptr; + size_t memlen; + block_to_ptr(start, length, &start_ptr, &memlen); + out->write(start_ptr, memlen); + } + + // Iterator + class iterator { + protected: + const array *ary; + size_t start_id, nid, end_id; + uint64_t mask; + char dna_str[33]; + public: + word key; + word val; + size_t id; + uint64_t hash; + + iterator(const array *_ary, size_t start, size_t end) : + ary(_ary), + start_id(start > ary->get_size() ? ary->get_size() : start), + nid(start), + end_id(end > ary->get_size() ? ary->get_size() : end), + mask(ary->get_size() - 1) + {} + + void get_string(char *out) const { + parse_dna::mer_binary_to_string(key, ary->get_key_len() / 2, out); + } + uint64_t get_hash() const { return hash; } + uint64_t get_pos() const { return hash & mask; } + uint64_t get_start() const { return start_id; } + uint64_t get_end() const { return end_id; } + word get_key() const { return key; } + word get_val() const { return val; } + size_t get_id() const { return id; } + char *get_dna_str() { + parse_dna::mer_binary_to_string(key, ary->get_key_len() / 2, dna_str); + return dna_str; + } + + bool next() { + bool success; + while((id = nid) < end_id) { + nid++; + success = ary->get_key_val_full(id, key, val); + if(success) { + hash = (key & ary->key_mask) << ary->lsize; + uint_t reprobep = (key >> ary->key_off) - 1; + hash |= (id - (reprobep > 0 ? 
ary->reprobes[reprobep] : 0)) & ary->size_mask; + key = ary->hash_inverse_matrix.times(hash); + return true; + } + } + return false; + } + }; + friend class iterator; + iterator iterator_all() const { return iterator(this, 0, get_size()); } + iterator iterator_slice(size_t slice_number, size_t number_of_slice) const { + std::pair res = slice(slice_number, number_of_slice, get_size()); + return iterator(this, res.first, res.second); + } + + /* Why on earth doesn't inheritance with : public iterator work + here? Resort to copying code. Arrrgggg.... + */ + class overlap_iterator { + protected: + const array *ary; + uint64_t mask; + size_t start_id, end_id, start_oid; + size_t moid, oid; + public: + word key; + word val; + size_t id; + uint64_t hash; + + overlap_iterator(const array *_ary, size_t start, size_t end) : + ary(_ary), + mask(ary->get_size() - 1), + start_id(start), + end_id(end > ary->get_size() ? ary->get_size() : end), + start_oid(start), + moid(end_id - start_id + ary->get_max_reprobe_offset()), + oid(0) + { + // Adjust for very small arrays and it overlaps with itself + if(moid > ary->get_size() - start_id) { + size_t last_id = (start_id + moid) % mask; + if(last_id > start_id) + moid -= last_id - start_id - 1; + } + } + + void get_string(char *out) const { + parse_dna::mer_binary_to_string(key, ary->get_key_len() / 2, out); + } + uint64_t get_hash() const { return hash; } + uint64_t get_pos() const { return hash & mask; } + uint64_t get_start() const { return start_id; } + uint64_t get_end() const { return end_id; } + + bool next() { + bool success; + while(oid < moid) { + id = (start_oid + oid++) & mask; + success = ary->get_key_val_full(id, key, val); + if(success) { + hash = (key & ary->key_mask) << ary->lsize; + uint_t reprobep = (key >> ary->key_off) - 1; + hash |= (id - (reprobep > 0 ? 
ary->reprobes[reprobep] : 0)) & ary->size_mask; + if(get_pos() < start_id || get_pos() >= end_id) + continue; + key = ary->hash_inverse_matrix.times(hash); + return true; + } + } + return false; + } + }; + friend class overlap_iterator; + + /* Return whether the entry is empty and if not, it returns the + * key and if it has the large bit set. + */ + void get_entry_stats(size_t id, bool &empty, word &key, bool &large) const { + word *w, *kvw = NULL; + const offset_t *o, *lo = NULL; + + w = offsets.get_word_offset(id, &o, &lo, data); + kvw = w + o->key.woff; + key = *kvw; + large = key & o->key.lb_mask; + if(large) + o = lo; + if(o->key.mask2) { + key = (key & o->key.mask1 & ~o->key.sb_mask1) >> o->key.boff; + key |= ((*(kvw+1)) & o->key.mask2 & ~o->key.sb_mask2) << o->key.shift; + } else { + key = (key & o->key.mask1) >> o->key.boff; + } + empty = key == 0; + } + + /* + * Return the key and value at position id. If the slot at id is + * empty, returns false. If the slot at position id has the large + * bit set, the key is resolved by looking backward in the + * table. The value returned on the other hand is the value at + * position id. No summation of the other entries for the key is + * done. 
+ */ + bool get_key_val(size_t id, word &key, word &val) const { + word *w, *kvw, *fw = NULL; + const offset_t *o, *lo, *fo = NULL; + bool large; + uint_t overflows; + + overflows = 0; + while(true) { + w = offsets.get_word_offset(id, &o, &lo, data); + kvw = w + o->key.woff; + key = *kvw; + large = key & o->key.lb_mask; + if(large) + o = lo; + if(o->key.mask2) { + key = (key & o->key.mask1 & ~o->key.sb_mask1) >> o->key.boff; + key |= ((*(kvw+1)) & o->key.mask2 & ~o->key.sb_mask2) << o->key.shift; + } else { + key = (key & o->key.mask1) >> o->key.boff; + } + + // Save offset and word for value retrieval + if(!fo) { + fo = o; + fw = w; + } + + if(large) { + if(key) + id -= reprobes[key]; + id = (id - reprobes[0]) & size_mask; + overflows++; + } else { + break; + } + } + if(!key) + return false; + + kvw = fw + fo->val.woff; + val = ((*kvw) & fo->val.mask1) >> fo->val.boff; + if(fo->val.mask2) { + val |= ((*(kvw+1)) & fo->val.mask2) << fo->val.shift; + } + if(overflows > 0) { + val <<= offsets.get_val_len(); + if(--overflows > 0) + val <<= offsets.get_lval_len() * overflows; + } + return true; + } + + /* + * Return the key and value at position id. If the slot at id is + * empty or has the large bit set, returns false. Otherwise, + * returns the key and the value is the sum of all the entries + * in the hash table for that key. I.e., the table is search + * forward for entries with large bit set pointing back to the + * key at id, and all those values are summed up. 
+ */ + bool get_key_val_full(size_t id, word &key, word &val, + bool carry_bit = false) const { + const offset_t *o, *lo; + word *w, *kvw, nkey, nval; + uint_t reprobe = 0, overflows = 0; + size_t cid; + + w = offsets.get_word_offset(id, &o, &lo, data); + kvw = w + o->key.woff; + key = *kvw; + if(key & o->key.lb_mask) + return false; + if(o->key.mask2) { + if((key & o->key.sb_mask1) == 0) + return false; + key = (key & o->key.mask1 & ~o->key.sb_mask1) >> o->key.boff; + key |= ((*(kvw+1)) & o->key.mask2 & ~o->key.sb_mask2) << o->key.shift; + } else { + key = (key & o->key.mask1) >> o->key.boff; + if(key == 0) + return false; + } + + kvw = w + o->val.woff; + val = ((*kvw) & o->val.mask1) >> o->val.boff; + if(o->val.mask2) + val |= ((*(kvw+1)) & o->val.mask2) << o->val.shift; + + if(carry_bit) { + bool do_reprobe = val & 0x1; + val >>= 1; + if(!do_reprobe) + return true; + } + + // Resolve value + reprobe = 0; + cid = id = (id + reprobes[0]) & size_mask; + while(reprobe <= reprobe_limit.val()) { + if(reprobe) + cid = (id + reprobes[reprobe]) & size_mask; + + w = offsets.get_word_offset(cid, &o, &lo, data); + kvw = w + o->key.woff; + nkey = *kvw; + if(nkey & o->key.lb_mask) { + if(lo->key.mask2) { + nkey = (nkey & lo->key.mask1 & ~lo->key.sb_mask1) >> lo->key.boff; + nkey |= ((*(kvw+1)) & lo->key.mask2 & ~lo->key.sb_mask2) << lo->key.shift; + } else { + nkey = (nkey & lo->key.mask1) >> lo->key.boff; + } + if(nkey == reprobe) { + kvw = w + lo->val.woff; + nval = ((*kvw) & lo->val.mask1) >> lo->val.boff; + if(lo->val.mask2) + nval |= ((*(kvw+1)) & lo->val.mask2) << lo->val.shift; + bool do_reprobe = true; + if(carry_bit) { + do_reprobe = nval & 0x1; + nval >>= 1; + } + + nval <<= offsets.get_val_len(); + nval <<= offsets.get_lval_len() * overflows; + val += nval; + + if(!do_reprobe) + return true; + + overflows++; + reprobe = 0; + cid = id = (cid + reprobes[0]) & size_mask; + continue; + } + } else { + if(o->key.mask2) { + if((nkey & o->key.sb_mask1) == 0) + return true; 
+ } else { + if((nkey & o->key.mask1) == 0) + return true; + } + } + + reprobe++; + } + + return true; + } + + inline bool get_val(const word key, word &val, bool full = false, + bool carry_bit = false) const { + uint64_t hash = hash_matrix.times(key); + size_t key_id; + return _get_val(hash & size_mask, key_id, (hash >> lsize) & key_mask, val, + full, carry_bit); + } + + inline bool get_val(const word key, size_t &key_id, word &val, + bool full = false, bool carry_bit = false) const { + uint64_t hash = hash_matrix.times(key); + return _get_val(hash & size_mask, key_id, (hash >> lsize) & key_mask, val, + full, carry_bit); + } + + // /* InputIterator points to keys (words). OutputIterator points + // to struct containing at least the fields { bool found; size_t + // key_id; word val; }. + // */ + // template + // void get_multi_val(InputIterator key, const InputIterator& key_end, + // OutputIterator val, bool full, bool carry_bit) const { + // uint64_t phash, chash; + // const offset_t *po, *plo, *co, *clo; + // const word *pw, *cw; + + // if(key == key_end) + // return; + + // // Call __get_val with a delay. Let prefetch work while we + // // compute the hash/get the previous key. 
+ // phash = hash_matrix.times(*key); + // pw = offsets.get_word_offset(phash & size_mask, &po, &plo, data); + // //__builtin_prefetch(pw + po->key.woff, 0, 3); + + // for(++key; key != key_end; ++key, ++val) { + // chash = hash_matrix.times(*key); + // cw = offsets.get_word_offset(chash & size_mask, &co, &clo, data); + // //__builtin_prefetch(cw + co->key.woff, 0, 3); + + // val->found = __get_val(phash & size_mask, val->key_id, + // (phash >> lsize) & key_mask, val->val, + // full, carry_bit, pw, po, plo); + + + // pw = cw; + // po = co; + // plo = clo; + // phash = chash; + // } + // // Last one + // val->found = __get_val(phash & size_mask, val->key_id, + // (phash >> lsize) & key_mask, val->val, + // full, carry_bit, pw, po, plo); + // } + + struct prefetch_info { + const word* w; + const offset_t *o, *lo; + }; + typedef simple_circular_buffer::pre_alloc prefetch_buffer; + + void warm_up_cache(prefetch_buffer& buffer, size_t id, bool load_lo) const { + buffer.clear(); + for(int i = 0; i < buffer.capacity(); ++i) { + buffer.push_back(); + prefetch_info& info = buffer.back(); + size_t cid = (id + (i > 0 ? 
reprobes[i] : 0)) & size_mask; + info.w = offsets.get_word_offset(cid, &info.o, &info.lo, data); + __builtin_prefetch(info.w + info.o->key.woff, 0, 1); + __builtin_prefetch(info.o, 0, 3); + if(load_lo) + __builtin_prefetch(info.lo, 0, 3); + } + } + + void prefetch_next(prefetch_buffer& buffer, size_t id, uint_t reprobe, bool load_lo) const { + buffer.pop_front(); + if(reprobe + buffer.capacity() <= reprobe_limit.val()) { + buffer.push_back(); + prefetch_info& info = buffer.back(); + size_t fid = (id + reprobes[reprobe + buffer.capacity() - 1]) & size_mask; + info.w = offsets.get_word_offset(fid, &info.o, &info.lo, data); + __builtin_prefetch(info.w + info.o->key.woff, 0, 1); + __builtin_prefetch(info.o, 0, 3); + if(load_lo) + __builtin_prefetch(info.lo, 0, 3); + } + + } + + bool _get_val(const size_t id, size_t &key_id, const word key, word &val, + bool full = false, bool carry_bit = false) const { + // Buffer for pre-cached information + prefetch_info info_ary[prefetch_buffer::capacity()]; + prefetch_buffer buffer(info_ary); + warm_up_cache(buffer, id, false); + + return __get_val(id, key_id, key, val, full, carry_bit, buffer); + } + + bool __get_val(const size_t id, size_t &key_id, const word key, word &val, + const bool full, bool carry_bit, + prefetch_buffer& buffer) const { + const word *kvw; + word nkey, nval; + size_t cid = id; + uint_t reprobe = 0; + word akey = key | ((word)1 << key_off); + + // Find key + const offset_t *o, *lo; + const word* w; + while(true) { + prefetch_info& info = buffer.front(); + w = info.w; + o = info.o; + kvw = w + o->key.woff; + nkey = *kvw; + + if(!(nkey & o->key.lb_mask)) { + if(o->key.mask2) { + nkey = (nkey & o->key.mask1 & ~o->key.sb_mask1) >> o->key.boff; + nkey |= ((*(kvw+1)) & o->key.mask2 & ~o->key.sb_mask2) << o->key.shift; + } else { + nkey = (nkey & o->key.mask1) >> o->key.boff; + } + if(nkey == akey) + break; + } + if(++reprobe > reprobe_limit.val()) + return false; + // Do reprobe + cid = (id + reprobes[reprobe]) & 
size_mask; + akey = key | ((reprobe + 1) << key_off); + + prefetch_next(buffer, id, reprobe, false); + } + + // Get value + kvw = w + o->val.woff; + val = ((*kvw) & o->val.mask1) >> o->val.boff; + if(o->val.mask2) { + val |= ((*(kvw+1)) & o->val.mask2) << o->val.shift; + } + bool do_reprobe = true; + if(carry_bit) { + do_reprobe = val & 0x1; + val >>= 1; + } + key_id = cid; + + // Eventually get large values... TODO: this seems buggy. It + // only looks for large values once, not as many times as + // needed. + if(full && do_reprobe) { + const size_t bid = (cid + reprobes[0]) & size_mask; + cid = bid; + + warm_up_cache(buffer, bid, true); + + reprobe = 0; + do { + prefetch_info& info = buffer.front(); + const word* w = info.w; + o = info.o; + lo = info.lo; + kvw = w + o->key.woff; + nkey = *kvw; + if(nkey & o->key.lb_mask) { + if(lo->key.mask2) { + nkey = (nkey & lo->key.mask1 & ~lo->key.sb_mask1) >> lo->key.boff; + nkey |= ((*(kvw+1)) & lo->key.mask2 & ~lo->key.sb_mask2) << lo->key.shift; + } else { + nkey = (nkey & lo->key.mask1) >> lo->key.boff; + } + if(nkey == reprobe) { + kvw = w + lo->val.woff; + nval = ((*kvw) & lo->val.mask1) >> lo->val.boff; + if(lo->val.mask2) + nval |= ((*(kvw+1)) & lo->val.mask2) << lo->val.shift; + if(carry_bit) { + nval >>= 1; + val |= nval << (offsets.get_val_len() - 1); + } else + val |= nval << offsets.get_val_len(); + break; // Should break only if carry_bit of nval is + // not set. Otherwise, we should reset the + // reprobe to 0 and try again. + } + } + + cid = (bid + reprobes[++reprobe]) & size_mask; + + prefetch_next(buffer, bid, reprobe, true); + } while(reprobe <= reprobe_limit.val()); + } + + return true; + } + + /** + * Use hash values as counters + */ + inline bool add(word key, word val, word *oval = 0) { + uint64_t hash = hash_matrix.times(key); + return add_rec(hash & size_mask, (hash >> lsize) & key_mask, + val, false, oval); + } + + + + /** + * Use hash as a set. 
+ */ + inline bool add(word _key, bool *is_new) __attribute__((deprecated)) { + size_t id; + return set(_key, is_new, &id); + } + inline bool set(word _key, bool *is_new) { + size_t id; + return set(_key, is_new, &id); + } + bool add(word _key, bool *is_new, size_t *id) __attribute__((deprecated)) { + return set(_key, is_new, id); + } + bool set(word _key, bool *is_new, size_t *id) { + const offset_t *ao; + uint64_t hash = hash_matrix.times(_key); + word *w; + *id = hash & size_mask; + return claim_key((hash >> lsize) & key_mask, is_new, id, false, + &ao, &w); + } + + /** + * Use hash as a map. This sets a value with the key. It is only + * partially thread safe. I.e., multiple different key can be + * added concurrently. On the other hand, the same key can not + * be added at the same time by different thread: the value set + * may not be correct. + */ + inline bool map(word _key, word val) { + bool is_new; + return map(_key, val, &is_new); + } + + bool map(word _key, word val, bool* is_new) { + uint64_t hash = hash_matrix.times(_key); + return map_rec(hash & size_mask, (hash >> lsize) & key_mask, val, false, + is_new); + } + + void write_ary_header(std::ostream *out) const { + hash_matrix.dump(out); + hash_inverse_matrix.dump(out); + } + + void write_raw(std::ostream *out) const { + if(out->tellp() & 0x7) { // Make sure aligned + std::string padding(0x8 - (out->tellp() & 0x7), '\0'); + out->write(padding.c_str(), padding.size()); + } + out->write((char *)mem_block.get_ptr(), mem_block.get_size()); + } + + private: + /* id is input/output. Equal to hash & size_maks on input. Equal + * to actual id where key was set on output. key is already hash + * shifted and masked to get higher bits. (>> lsize & key_mask) + * + * is_new is set on output to true if key did not exists in hash + * before. *ao points to the actual offsets object. 
+ */ + bool claim_key(const word &key, bool *is_new, size_t *id, bool large, + const offset_t **_ao, word **_w) { + uint_t reprobe = 0; + const offset_t *o, *lo, *ao; + word *w, *kw, nkey; + bool key_claimed = false; + size_t cid = *id; + word akey = large ? 0 :(key | ((word)1 << key_off)); + + do { + *_w = w = offsets.get_word_offset(cid, &o, &lo, data); + *_ao = ao = large ? lo : o; + + kw = w + ao->key.woff; + + if(ao->key.mask2) { // key split on two words + nkey = akey << ao->key.boff; + nkey |= ao->key.sb_mask1; + if(large) + nkey |= ao->key.lb_mask; + nkey &= ao->key.mask1; + + // Use o->key.mask1 and not ao->key.mask1 as the first one is + // guaranteed to be bigger. The key needs to be free on its + // longer mask to claim it! + key_claimed = set_key(kw, nkey, o->key.mask1, ao->key.mask1); + if(key_claimed) { + nkey = ((akey >> ao->key.shift) | ao->key.sb_mask2) & ao->key.mask2; + key_claimed = key_claimed && set_key(kw + 1, nkey, o->key.mask2, ao->key.mask2, is_new); + } + } else { // key on one word + nkey = akey << ao->key.boff; + if(large) + nkey |= ao->key.lb_mask; + nkey &= ao->key.mask1; + key_claimed = set_key(kw, nkey, o->key.mask1, ao->key.mask1, is_new); + } + if(!key_claimed) { // reprobe + if(++reprobe > reprobe_limit.val()) + return false; + cid = (*id + reprobes[reprobe]) & size_mask; + + if(large) + akey = reprobe; + else + akey = key | ((reprobe + 1) << key_off); + } + } while(!key_claimed); + + *id = cid; + return true; + } + + bool add_rec(size_t id, word key, word val, bool large, word *oval) { + const offset_t *ao; + word *w; + + bool is_new = false; + if(!claim_key(key, &is_new, &id, large, &ao, &w)) + return false; + if(oval) + *oval = !is_new; + + // Increment value + word *vw = w + ao->val.woff; + word cary = add_val(vw, val, ao->val.boff, ao->val.mask1); + cary >>= ao->val.shift; + if(cary && ao->val.mask2) { // value split on two words + cary = add_val(vw + 1, cary, 0, ao->val.mask2); + cary >>= ao->val.cshift; + } + if(cary) { + 
id = (id + reprobes[0]) & size_mask; + if(add_rec(id, key, cary, true, 0)) + return true; + + // Adding failed, table is full. Need to back-track and + // substract val. + cary = add_val(vw, ((word)1 << offsets.get_val_len()) - val, + ao->val.boff, ao->val.mask1); + cary >>= ao->val.shift; + if(cary && ao->val.mask2) { + // Can I ignore the cary here? Table is known to be full, so + // not much of a choice. But does it leave the table in a + // consistent state? + add_val(vw + 1, cary, 0, ao->val.mask2); + } + return false; + } + return true; + } + + // Store val in the hash at position id. Reprobe and recurse if + // val does not fit in counter field. A bit is added at the + // beginning of the counting field indicating whether there is + // another entry for the same key further in the hash. + bool map_rec(size_t id, word key, word val, bool large, bool* is_new) { + const offset_t* ao; + word* w; + bool is_new_entry = false; + if(!claim_key(key, &is_new_entry, &id, large, &ao, &w)) + return false; + if(is_new) + *is_new = is_new_entry; + + // Determine if there will be a carry + val <<= 1; + val |= (val > offsets.get_max_val(large)); + + // Set value + word *vw = w + ao->val.woff; + word oval = 0; + word cary = set_val(vw, val, ao->val.boff, ao->val.mask1, oval); + cary >>= ao->val.shift; + bool cary_bit = oval & 0x1; + if(ao->val.mask2) { // value split on two words. 
Write even if + // cary is 0 as there maybe some value in + // there already + cary = set_val(vw + 1, cary, 0, ao->val.mask2, oval); + cary >>= ao->val.cshift; + } + // Done if there is no carry and previous value did not have + // the carry_bit set + if(!cary && !cary_bit) + return true; + id = (id + reprobes[0]) & size_mask; + return map_rec(id, key, cary, true, 0); + } + + inline bool set_key(word *w, word nkey, word free_mask, + word equal_mask) { + word ow = *w, nw, okey; + + okey = ow & free_mask; + while(okey == 0) { // large bit not set && key is free + nw = atomic.cas(w, ow, ow | nkey); + if(nw == ow) + return true; + ow = nw; + okey = ow & free_mask; + } + return (ow & equal_mask) == nkey; + } + + inline bool set_key(word *w, word nkey, word free_mask, + word equal_mask, bool *is_new) { + word ow = *w, nw, okey; + + okey = ow & free_mask; + while(okey == 0) { // large bit not set && key is free + nw = atomic.cas(w, ow, ow | nkey); + if(nw == ow) { + *is_new = true; + return true; + } + ow = nw; + okey = ow & free_mask; + } + *is_new = false; + return (ow & equal_mask) == nkey; + } + + + inline word add_val(word *w, word val, uint_t shift, word mask) { + word now = *w, ow, nw, nval; + + do { + ow = now; + nval = ((ow & mask) >> shift) + val; + nw = (ow & ~mask) | ((nval << shift) & mask); + now = atomic.cas(w, ow, nw); + } while(now != ow); + + return nval & (~(mask >> shift)); + } + + inline word set_val(word *w, word val, uint_t shift, word mask, + word& oval) { + word now = *w, ow, nw; + word sval = (val << shift) & mask; + + do { + ow = now; + nw = (ow & ~mask) | sval; + now = atomic.cas(w, ow, nw); + } while(now != ow); + + oval = (ow & mask) >> shift; + return val & (~(mask >> shift)); + } + }; + + /*****/ + } +} + +#endif // __REVERSIBLE_HASH__ diff --git a/src/inc/jellyfish/locks_pthread.hpp b/src/inc/jellyfish/locks_pthread.hpp new file mode 100644 index 00000000..63d24224 --- /dev/null +++ b/src/inc/jellyfish/locks_pthread.hpp @@ -0,0 +1,172 @@ 
+/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . +*/ + +#ifndef __JELLYFISH_LOCKS_PTHREAD_HPP__ +#define __JELLYFISH_LOCKS_PTHREAD_HPP__ + +#include + +namespace locks { + namespace pthread { + class cond + { + pthread_mutex_t _mutex; + pthread_cond_t _cond; + + public: + cond() { + pthread_mutex_init(&_mutex, NULL); + pthread_cond_init(&_cond, NULL); + } + + ~cond() { + pthread_cond_destroy(&_cond); + pthread_mutex_destroy(&_mutex); + } + + inline void lock() { pthread_mutex_lock(&_mutex); } + inline void unlock() { pthread_mutex_unlock(&_mutex); } + inline void wait() { pthread_cond_wait(&_cond, &_mutex); } + inline void signal() { pthread_cond_signal(&_cond); } + inline void broadcast() { pthread_cond_broadcast(&_cond); } + }; + + class mutex + { + pthread_mutex_t _mutex; + + public: + mutex() { + pthread_mutex_init(&_mutex, NULL); + } + + ~mutex() { + pthread_mutex_destroy(&_mutex); + } + + inline void lock() { pthread_mutex_lock(&_mutex); } + inline void unlock() { pthread_mutex_unlock(&_mutex); } + inline bool try_lock() { return !pthread_mutex_trylock(&_mutex); } + }; + + class Semaphore + { + int _value, _wakeups; + cond _cv; + public: + explicit Semaphore(int value) : + _value(value), + _wakeups(0) + { + // nothing to do + } + + ~Semaphore() {} + + inline void wait() { + _cv.lock(); + _value--; + if (_value < 0) { + do { + _cv.wait(); + } while(_wakeups < 1); + _wakeups--; + } + 
_cv.unlock(); + } + + inline void signal() { + _cv.lock(); + _value++; + if(_value <= 0) { + _wakeups++; + _cv.signal(); + } + _cv.unlock(); + } + }; + +#if defined(_POSIX_BARRIERS) && (_POSIX_BARRIERS - 20012L) >= 0 + class barrier + { + pthread_barrier_t _barrier; + + public: + explicit barrier(unsigned count) { + + pthread_barrier_init(&_barrier, NULL, count); + } + + ~barrier() { + pthread_barrier_destroy(&_barrier); + } + + inline int wait() { + return pthread_barrier_wait(&_barrier); + } + }; + +#else +# ifndef PTHREAD_BARRIER_SERIAL_THREAD +# define PTHREAD_BARRIER_SERIAL_THREAD 1 +# endif + + class barrier + { + int count; // required # of threads + int current; // current # of threads that have passed thru + mutex barlock; // protect current + Semaphore barrier1; // implement the barrier + Semaphore barrier2; + + public: + explicit barrier(unsigned cnt) + : count(cnt), current(0), barrier1(0), barrier2(0) { + } + + ~barrier() {} + + inline int wait() { + int ret = 0; + barlock.lock(); + current += 1; + if(current == count) { + ret = PTHREAD_BARRIER_SERIAL_THREAD; + for(int i=0; i. 
+*/ + +#ifndef __JELLYFISH_MAPPED_FILE_HPP__ +#define __JELLYFISH_MAPPED_FILE_HPP__ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +class mapped_file { +protected: + std::string _path; + bool _unmap; + char *_base, *_end; + size_t _length; + + void map(const char *filename) { + int fd = open(filename, O_RDONLY); + struct stat stat; + + if(fd < 0) + eraise(ErrorMMap) << "Can't open file '" << filename << "'" << err::no; + + if(fstat(fd, &stat) < 0) + eraise(ErrorMMap) << "Can't stat file '" << filename << "'" << err::no; + + _length = stat.st_size; + _base = (char *)mmap(NULL, _length, PROT_READ, MAP_PRIVATE, fd, 0); + if(_base == MAP_FAILED) + eraise(ErrorMMap) << "Can't mmap file '" << filename << "'" << err::no; + close(fd); + _end = _base + _length; + } + +public: + define_error_class(ErrorMMap); + mapped_file(char *__base, size_t __length) : + _unmap(false), _base(__base), _end(__base + __length), _length(__length) {} + + explicit mapped_file(const char *filename) : _path(filename), _unmap(false) { + map(filename); + } + mapped_file(const mapped_file &mf) : + _path(mf.path()), _unmap(false), _base(mf._base), _end(mf._end), + _length(mf._length) {} + + ~mapped_file() { + if(_unmap) + unmap(); + } + + void unmap() { + if(!_base) + return; + munmap(_base, _length); + _base = 0; + _length = 0; + } + + char *base() const { return _base; } + char *end() const { return _end; } + size_t length() const { return _length; } + std::string path() const { return _path; } + + bool will_unmap(bool value = true) { + bool ovalue = _unmap; + _unmap = value; + return ovalue; + } + // No error checking here. Should I throw something? 
+ const mapped_file & will_need() const { + madvise(_base, _length, MADV_WILLNEED); + return *this; + } + const mapped_file & sequential() const { + madvise(_base, _length, MADV_SEQUENTIAL); + return *this; + } + const mapped_file & random() const { + madvise(_base, _length, MADV_RANDOM); + return *this; + } + const mapped_file & lock() const { + if(mlock(_base, _length) < 0) + eraise(ErrorMMap) << "Can't lock map in memory" << err::no; + return *this; + } + + // Do not optimize. Side effect is that every page is accessed and should now be in cache. + // The flagg __attribute__((optimize(0))) does not compile, so return a useless char argument + char load() const; +}; + +class mapped_files_t : public std::vector { +public: + mapped_files_t(int nb_files, char *argv[]) { + for(int j = 0; j < nb_files; j++) + push_back(mapped_file(argv[j])); + } + + mapped_files_t(int nb_files, char *argv[], bool sequential) { + for(int j = 0; j < nb_files; j++) { + push_back(mapped_file(argv[j])); + if(sequential) + end()->sequential(); + } + } +}; + +// File mapped on demand. 
+class lazy_mapped_file_t : public mapped_file { + std::string _path; + volatile bool done; + volatile long used_counter; + +public: + explicit lazy_mapped_file_t(const char *path) : + mapped_file((char *)0, (size_t)0), + _path(path), done(false), used_counter(0) {} + + void map() { + used_counter = 1; + done = false; + mapped_file::map(_path.c_str()); + } + void unmap() { + done = true; + dec(); + } + + void inc() { + atomic::gcc::fetch_add(&used_counter, (long)1); + } + void dec() { + long val = atomic::gcc::add_fetch(&used_counter, (long)-1); + if(done && val == 0) + mapped_file::unmap(); + } +}; + +class lazy_mapped_files_t : public std::vector { +public: + lazy_mapped_files_t(int nb_files, char *argv[]) { + for(int j = 0; j < nb_files; j++) + push_back(lazy_mapped_file_t(argv[j])); + } + + lazy_mapped_files_t(int nb_files, char *argv[], bool sequential) { + for(int j = 0; j < nb_files; j++) { + push_back(lazy_mapped_file_t(argv[j])); + if(sequential) + end()->sequential(); + } + } +}; + +#endif diff --git a/src/inc/jellyfish/mer_counting.hpp b/src/inc/jellyfish/mer_counting.hpp new file mode 100644 index 00000000..42bc2c2f --- /dev/null +++ b/src/inc/jellyfish/mer_counting.hpp @@ -0,0 +1,68 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . 
+*/ + +#ifndef __JELLYFISH_MER_COUNTING__ +#define __JELLYFISH_MER_COUNTING__ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Invertible hash types +#include +#include +typedef jellyfish::invertible_hash::array inv_hash_storage_t; +typedef jellyfish::sorted_dumper< inv_hash_storage_t,atomic::gcc> inv_hash_dumper_t; +typedef jellyfish::raw_hash::dumper raw_inv_hash_dumper_t; +typedef jellyfish::raw_hash::query raw_inv_hash_query_t; +typedef jellyfish::hash< uint64_t,uint64_t,inv_hash_storage_t,atomic::gcc > inv_hash_t; + +// Direct indexing types +#include +#include +#include +typedef jellyfish::direct_indexing::array,atomic::gcc,allocators::mmap> direct_index_storage_t; +typedef jellyfish::direct_sorted_dumper< direct_index_storage_t, atomic::gcc> direct_index_dumper_t; +typedef jellyfish::hash< uint64_t,jellyfish::capped_integer,direct_index_storage_t,atomic::gcc> direct_index_t; + +// Quake types +#include +#include +#include +#include +typedef jellyfish::aligned_values::array fastq_storage_t; +typedef jellyfish::hash fastq_hash_t; +typedef jellyfish::fastq_hash::raw_dumper raw_fastq_dumper_t; + +// Compacted hash types +typedef jellyfish::compacted_hash::reader hash_reader_t; +typedef jellyfish::compacted_hash::query hash_query_t; +typedef jellyfish::compacted_hash::writer hash_writer_t; + +#endif /* __MER_COUNTING__ */ diff --git a/src/inc/jellyfish/misc.hpp b/src/inc/jellyfish/misc.hpp new file mode 100644 index 00000000..889adf51 --- /dev/null +++ b/src/inc/jellyfish/misc.hpp @@ -0,0 +1,144 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . +*/ + +#ifndef __JELLYFISH_MISC_HPP__ +#define __JELLYFISH_MISC_HPP__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define bsizeof(v) (8 * sizeof(v)) +typedef uint_fast64_t uint_t; +//#define UINT_C(x) +#define PRIUINTu PRIuFAST64 +#define PRIUINTx PRIxFAST64 + +inline int leading_zeroes(int x) { return __builtin_clz(x); } // CLK +inline int leading_zeroes(unsigned int x) { return __builtin_clz(x); } +inline int leading_zeroes(unsigned long x) { return __builtin_clzl(x); } +inline int leading_zeroes(unsigned long long x) { return __builtin_clzll(x); } + + +template +unsigned int floorLog2(T n) { + return sizeof(T) * 8 - 1 - leading_zeroes(n); +} + +template +uint_t ceilLog2(T n) { + uint_t r = floorLog2(n); + return n > (((T)1) << r) ? r + 1 : r; +} + +template +T div_ceil(T a, T b) { + T q = a / b; + return a % b == 0 ? q : q + 1; +} + +template +uint_t bitsize(T n) { + return floorLog2(n) + 1; +} + +inline uint32_t reverse_bits(uint32_t v) { + // swap odd and even bits + v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); + // swap consecutive pairs + v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); + // swap nibbles ... 
+ v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); + // swap bytes + v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); + // swap 2-byte long pairs + v = ( v >> 16 ) | ( v << 16); + return v; +} + +inline uint64_t reverse_bits(uint64_t v) { + v = ((v >> 1) & 0x5555555555555555UL) | ((v & 0x5555555555555555UL) << 1); + v = ((v >> 2) & 0x3333333333333333UL) | ((v & 0x3333333333333333UL) << 2); + v = ((v >> 4) & 0x0F0F0F0F0F0F0F0FUL) | ((v & 0x0F0F0F0F0F0F0F0FUL) << 4); + v = ((v >> 8) & 0x00FF00FF00FF00FFUL) | ((v & 0x00FF00FF00FF00FFUL) << 8); + v = ((v >> 16) & 0x0000FFFF0000FFFFUL) | ((v & 0x0000FFFF0000FFFFUL) << 16); + v = ( v >> 32 ) | ( v << 32); + return v; +} + +uint64_t bogus_sum(void *data, size_t len); + +template +size_t bits_to_bytes(T bits) { + return (size_t)((bits / 8) + (bits % 8 != 0)); +} + +template +union Tptr { + void *v; + T *t; +}; +template +T *calloc_align(size_t nmemb, size_t alignment) { + Tptr ptr; + if(posix_memalign(&ptr.v, alignment, sizeof(T) * nmemb) < 0) + throw std::bad_alloc(); + return ptr.t; +} + +/* Be pedantic about memory access. Any misaligned access will + * generate a BUS error. + */ +void disabled_misaligned_mem_access(); + +/* Raison d'etre of this version of mem_copy: It seems we have slow + * down due to misaligned cache accesses. glibc memcpy does unaligned + * memory accesses and crashes when they are disabled. This version + * does only aligned memory access (see above). + */ +template +void mem_copy(char *dest, const char *src, const T &len) { + // dumb copying char by char + for(T i = (T)0; i < len; ++i) + *dest++ = *src++; +} + +/* Slice a large number (total) in almost equal parts. 
return [start, + end) corresponding to the ith part (0 <= i < number_of_slices) + */ +template +std::pair slice(T i, T number_of_slices, T total) { + if(number_of_slices > 1) + --number_of_slices; + T slice_size = total / number_of_slices; + T slice_remain = total % number_of_slices; + T start = std::min(total, i * slice_size + i * slice_remain / number_of_slices); + T end = std::min(total, (i + 1) * slice_size + (i + 1) * slice_remain / number_of_slices); + return std::make_pair(start, end); +} + +std::streamoff get_file_size(std::istream& is); +#endif // __MISC_HPP__ diff --git a/src/inc/jellyfish/offsets_key_value.hpp b/src/inc/jellyfish/offsets_key_value.hpp new file mode 100644 index 00000000..fdc16e3a --- /dev/null +++ b/src/inc/jellyfish/offsets_key_value.hpp @@ -0,0 +1,239 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . +*/ + +#ifndef __JELLYFISH_OFFSETS_KEY_VALUE_HPP__ +#define __JELLYFISH_OFFSETS_KEY_VALUE_HPP__ + +#include +#include +#include + +namespace jellyfish { + /* A word is whatever aligned type used for atomic operations + * (CAS). Typically, a uint64_t. We store pairs of (key, value), in a + * bit packed fashion. The key and value can have abritrary size as + * long as they each fit in one word. A block is the largest number of + * (key, value) pair such that the first key, and only the first, + * starts at an aligned word. + * + * The key 0x0 is not valid. 
A key which fits completely within one + * word is not protected by a "set" bit. A key which straddle the + * boundary between two aligned words has a set bit in each parts. + * + * A value field can have any value and is initialized to 0x0. It has + * no "set" bit ever. + * + * A key is prefixed with a "large" bit. If this bit is 0, the key + * field is length key_len (not counting the possible set bits) and + * the value field has length val_len. If the large bit has value 1, + * the key field is just long enough to encode the number of + * reprobing hops to go backward to find the actual key. The + * remainder bits is used for the value field. In this scheme, we + * assume the length needed to encode the number of reprobes is much + * less than the length needed to encode the key. + */ + + /* Offsets holds all the possible offset for a given combination of + * key length, value length and reprobe limit. + */ + template + class Offsets { + public: + // woff: offset in words from beginning of block + // boff: offset in bits within that word. Past large bit. + // shift: shift in second word, if any + // mask1: includes the large bit and the set bit if any. + // mask2: mask in second word. Idem as mask1. 0 if fits in one word. + // sb_mask[12]: mask for set bit in word 1 and 2, if any. set bit is the + // last usable bit of the field. + // lb_mask: mask for the large bit. It is the first bit of the key field. 
+ typedef struct { + struct { + uint_t woff, boff, shift, cshift; + word mask1, mask2, sb_mask1, sb_mask2, lb_mask; + } key; + struct { + uint_t woff, boff, shift, cshift; + word mask1, mask2; + } val; + } offset_t; + typedef struct { + offset_t normal; + offset_t large; + } offset_pair_t; + struct block_info { + uint_t len; + uint_t word_len; + }; + // Offsets() {} + + Offsets(uint_t _key_len, uint_t _val_len, uint_t _reprobe_limit) : + key_len(_key_len), + val_len(_val_len), + reprobe_limit(_reprobe_limit), + reprobe_len(bitsize(reprobe_limit)), + lval_len(key_len + val_len - reprobe_len), + block(compute_offsets()), + bld(block.len) + { } + + ~Offsets() {} + + uint_t get_block_len() const { return block.len; } + uint_t get_block_word_len() const { return block.word_len; } + uint_t get_reprobe_len() const { return reprobe_len; } + uint_t get_key_len() const { return key_len; } + uint_t get_val_len() const { return val_len; } + uint_t get_lval_len() const { return lval_len; } + word get_max_val(bool large) const { + return (((uint64_t)1) << (large ? lval_len : val_len)) - 1; + } + + // Discretize and round down number of entries according to length + // of a block. Return in blocks the number of blocks. 
+ size_t floor_block(size_t entries, size_t &blocks) const { + blocks = entries / bld; + return block.len * blocks; + } + + word *get_word_offset(size_t id, const offset_t **o, const offset_t **lo, + word * const base) const { + uint64_t q, r; + bld.division(id, q, r); + word *w = base + (block.word_len * q); + *o = &offsets[r].normal; + *lo = &offsets[r].large; + return w; + } + + private: + const uint_t key_len, val_len; + const uint_t reprobe_limit, reprobe_len, lval_len; + const block_info block; + const divisor64 bld; // Fast divisor by block.len + offset_pair_t offsets[bsizeof(word)]; + + block_info compute_offsets(); + bool update_current_offsets(uint_t &cword, uint_t &cboff, uint_t add); + word mask(uint_t length, uint_t shift); + }; + + template + bool Offsets::update_current_offsets(uint_t &cword, uint_t &cboff, uint_t add) + { + cboff += add; + if(cboff >= bsizeof(word)) { + cword++; + cboff %= bsizeof(word); + return cboff > 0; + } + return false; + } + + template + word Offsets::mask(uint_t length, uint_t shift) + { + return (((word)1u << length) - 1) << shift; + } + + template + typename Offsets::block_info Offsets::compute_offsets() + { + offset_pair_t *offset = offsets; + uint_t cword = 0; // current word in block + uint_t cboff = 0; // current offset in word + uint_t lcword; // idem for large fields + uint_t lcboff; + uint_t ocboff; + + memset(offsets, '\0', sizeof(offsets)); + do { + offset->normal.key.woff = offset->large.key.woff = lcword = cword; + ocboff = lcboff = cboff; + offset->normal.key.boff = cboff + 1; + offset->normal.key.lb_mask = mask(1, cboff); + if(update_current_offsets(cword, cboff, key_len + 1)) { + // key extends over two words -> add extra set bits + update_current_offsets(cword, cboff, 2); + offset->normal.key.mask1 = mask(bsizeof(word) - ocboff, ocboff); + offset->normal.key.mask2 = mask(cboff, 0); + offset->normal.key.shift = key_len + 1 - cboff; + offset->normal.key.cshift = cboff - 1; + offset->normal.key.sb_mask1 = 
mask(1, bsizeof(word) - 1); + offset->normal.key.sb_mask2 = mask(1, cboff - 1); + } else { + offset->normal.key.mask1 = mask(key_len + 1, ocboff); + offset->normal.key.mask2 = 0; + offset->normal.key.shift = 0; + offset->normal.key.cshift = 0; + offset->normal.key.sb_mask1 = 0; + offset->normal.key.sb_mask2 = 0; + } + offset->normal.val.woff = cword; + offset->normal.val.boff = cboff; + offset->normal.val.mask1 = mask(val_len, cboff); + if(update_current_offsets(cword, cboff, val_len)) { + offset->normal.val.mask2 = mask(cboff, 0); + offset->normal.val.shift = val_len - cboff; + offset->normal.val.cshift = cboff; + } else { + offset->normal.val.mask2 = 0; + offset->normal.val.shift = val_len; + offset->normal.val.cshift = 0; + } + + ocboff = lcboff; + offset->large.key.boff = lcboff + 1; + offset->large.key.lb_mask = mask(1, lcboff); + if(update_current_offsets(lcword, lcboff, reprobe_len + 1)) { + update_current_offsets(lcword, lcboff, 2); + offset->large.key.mask1 = mask(bsizeof(word) - ocboff, ocboff); + offset->large.key.mask2 = mask(lcboff, 0); + offset->large.key.shift = reprobe_len + 1 - lcboff; + offset->large.key.cshift = lcboff - 1; + offset->large.key.sb_mask1 = mask(1, bsizeof(word) - 1); + offset->large.key.sb_mask2 = mask(1, lcboff - 1); + } else { + offset->large.key.mask1 = mask(reprobe_len + 1, ocboff); + offset->large.key.mask2 = 0; + offset->large.key.boff = ocboff + 1; + offset->large.key.shift = 0; + offset->large.key.cshift = 0; + offset->large.key.sb_mask1 = 0; + offset->large.key.sb_mask2 = 0; + } + offset->large.val.woff = lcword; + offset->large.val.boff = lcboff; + offset->large.val.mask1 = mask(lval_len, lcboff); + if(update_current_offsets(lcword, lcboff, lval_len)) { + offset->large.val.mask2 = mask(lcboff, 0); + offset->large.val.shift = lval_len - lcboff; + offset->large.val.cshift = lcboff; + } else { + offset->large.val.mask2 = 0; + offset->large.val.shift = lval_len; + offset->large.val.cshift = 0; + } + + offset++; + } 
while(cboff != 0 && cboff < bsizeof(word) - 2); + + block_info res = { static_cast(offset - offsets), cword + (cboff == 0 ? 0 : 1) }; + return res; + } +} + +#endif // __OFFSETS_KEY_VALUE_HPP__ diff --git a/src/inc/jellyfish/parse_dna.hpp b/src/inc/jellyfish/parse_dna.hpp new file mode 100644 index 00000000..cd1eb837 --- /dev/null +++ b/src/inc/jellyfish/parse_dna.hpp @@ -0,0 +1,194 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . +*/ + +#ifndef __JELLYFISH_PARSE_DNA_HPP__ +#define __JELLYFISH_PARSE_DNA_HPP__ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace jellyfish { + class parse_dna : public double_fifo_input { + typedef std::vector fary_t; + + uint_t mer_len; + size_t buffer_size; + const fary_t files; + fary_t::const_iterator current_file; + bool have_seam; + char *seam; + allocators::mmap buffer_data; + bool canonical; + sequence_parser *fparser; + + public: + /* Action to take for a given letter in fasta file: + * A, C, G, T: map to 0, 1, 2, 3. Append to kmer + * Other nucleic acid code: map to -1. reset kmer + * '\n': map to -2. ignore + * Other ASCII: map to -3. Report error. 
+     */
+    /// Pack the first `klen` bases of `in` into an integer, 2 bits per base,
+    /// first base in the most significant position. Returns 0 as soon as a
+    /// non-DNA character is met (indistinguishable from an all-'A' mer).
+    static uint64_t mer_string_to_binary(const char *in, uint_t klen) {
+      uint64_t res = 0;
+      for(uint_t i = 0; i < klen; i++) {
+        // Index through unsigned char: where char is signed, a byte >= 0x80
+        // would otherwise be sign-extended into a huge table index.
+        const uint_t c = dna_codes[(unsigned char)*in++];
+        if(c & CODE_NOT_DNA)
+          return 0;
+        res = (res << 2) | c;
+      }
+      return res;
+    }
+    /// Inverse of mer_string_to_binary(). `out` must have room for klen + 1
+    /// characters; it is NUL terminated.
+    static void mer_binary_to_string(uint64_t mer, uint_t klen, char *out) {
+      static const char table[4] = { 'A', 'C', 'G', 'T' };
+
+      for(unsigned int i = 0 ; i < klen; i++) {
+        out[klen-1-i] = table[mer & (uint64_t)0x3];
+        mer >>= 2;
+      }
+      out[klen] = '\0';
+    }
+
+    /// Reverse complement of a mer of `length` bases packed 2 bits per base.
+    /// The 2-bit groups are reversed over the whole word, complemented, then
+    /// shifted down to the low 2 * length bits. `length` must not be 0 (the
+    /// final shift would be by the full word width).
+    static uint64_t reverse_complement(uint64_t v, uint_t length) {
+      v = ((v >> 2)  & 0x3333333333333333UL) | ((v & 0x3333333333333333UL) << 2);
+      v = ((v >> 4)  & 0x0F0F0F0F0F0F0F0FUL) | ((v & 0x0F0F0F0F0F0F0F0FUL) << 4);
+      v = ((v >> 8)  & 0x00FF00FF00FF00FFUL) | ((v & 0x00FF00FF00FF00FFUL) << 8);
+      v = ((v >> 16) & 0x0000FFFF0000FFFFUL) | ((v & 0x0000FFFF0000FFFFUL) << 16);
+      v = ( v >> 32                        ) | ( v                         << 32);
+      return (((uint64_t)-1) - v) >> (bsizeof(v) - (length << 1));
+    }
+
+    template<typename T>
+    parse_dna(T _files_start, T _files_end, uint_t _mer_len,
+              unsigned int nb_buffers, size_t _buffer_size);
+
+    ~parse_dna() {
+      delete [] seam;
+    }
+
+    void set_canonical(bool v = true) { canonical = v; }
+    virtual void fill();
+
+    /// Per-thread consumer: pulls filled buckets from the parser and feeds
+    /// every mer found in them to a counter.
+    class thread {
+      parse_dna      *parser;
+      bucket_t       *sequence;
+      const uint_t    mer_len, lshift;
+      uint64_t        kmer, rkmer;
+      const uint64_t  masq;
+      uint_t          cmlen;
+      const bool      canonical;
+      uint64_t        distinct, total;
+      typedef         void (*error_reporter)(std::string& err);
+      error_reporter  error_report;
+
+    public:
+      // masq keeps the low 2 * mer_len bits. Shifting a 64-bit value by 64
+      // is undefined, so mer_len == 32 gets the all-ones mask explicitly;
+      // uint64_t is used rather than 1UL, which may be only 32 bits wide.
+      explicit thread(parse_dna *_parser) :
+        parser(_parser), sequence(0),
+        mer_len(_parser->mer_len), lshift(2 * (mer_len - 1)),
+        kmer(0), rkmer(0),
+        masq(mer_len < 32 ? ((uint64_t)1 << (2 * mer_len)) - 1 : ~(uint64_t)0),
+        cmlen(0), canonical(parser->canonical),
+        distinct(0), total(0), error_report(0) {}
+
+      uint64_t get_uniq() const { return 0; }
+      uint64_t get_distinct() const { return distinct; }
+      uint64_t get_total() const { return total; }
+
+      /// Consume buckets until parser->next() returns null. Every window of
+      /// mer_len consecutive bases is added to `counter` with count 1 (the
+      /// smaller of the mer and its reverse complement when canonical).
+      template<typename T>
+      void parse(T &counter) {
+        cmlen = kmer = rkmer = 0;
+        while((sequence = parser->next())) {
+          const char         *start = sequence->start;
+          const char * const  end   = sequence->end;
+          while(start < end) {
+            // unsigned char index: see mer_string_to_binary()
+            const uint_t c = dna_codes[(unsigned char)*start++];
+            switch(c) {
+            case CODE_IGNORE: break;
+            case CODE_COMMENT:
+              report_bad_input(*(start-1));
+              // Fall through
+            case CODE_RESET:
+              cmlen = kmer = rkmer = 0;
+              break;
+
+            default:
+              kmer = ((kmer << 2) & masq) | c;
+              rkmer = (rkmer >> 2) | ((0x3 - c) << lshift);
+              if(++cmlen >= mer_len) {
+                cmlen  = mer_len;
+                typename T::val_type oval;
+                if(canonical)
+                  counter->add(kmer < rkmer ? kmer : rkmer, 1, &oval);
+                else
+                  counter->add(kmer, 1, &oval);
+                // oval presumably receives the previous count: 0 means new mer
+                distinct += oval == (typename T::val_type)0;
+                ++total;
+              }
+            }
+          }
+
+          // Buffer exhausted. Get a new one
+          cmlen = kmer = rkmer = 0;
+          parser->release(sequence);
+        }
+      }
+
+      void set_error_reporter(error_reporter e) {
+        error_report = e;
+      }
+
+    private:
+      void report_bad_input(char c) {
+        if(!error_report)
+          return;
+        std::string err("Bad character in sequence: ");
+        err += c;
+        error_report(err);
+      }
+    };
+    friend class thread;
+    thread new_thread() { return thread(this); }
+  };
+}
+
+// NOTE(review): *current_file is dereferenced below, so the file range must
+// not be empty - confirm that callers guarantee it.
+template<typename T>
+jellyfish::parse_dna::parse_dna(T _files_start, T _files_end,
+                                uint_t _mer_len,
+                                unsigned int nb_buffers, size_t _buffer_size) :
+  double_fifo_input(nb_buffers), mer_len(_mer_len),
+  buffer_size(allocators::mmap::round_to_page(_buffer_size)),
+  files(_files_start, _files_end), current_file(files.begin()),
+  have_seam(false), buffer_data(buffer_size * nb_buffers), canonical(false)
+{
+  seam = new char[mer_len];
+  memset(seam, 'A', mer_len);
+
+  // Give each bucket its own buffer_size slice of the mapped area.
+  unsigned long i = 0;
+  for(bucket_iterator it = bucket_begin();
+      it != bucket_end(); ++it, ++i) {
+    it->end = it->start = (char *)buffer_data.get_ptr() + i * buffer_size;
+  }
+  assert(i == nb_buffers);
+
+  fparser = sequence_parser::new_parser(*current_file);
+}
+
+
+#endif
diff --git a/src/inc/jellyfish/parse_quake.hpp b/src/inc/jellyfish/parse_quake.hpp
new file mode
100644 index 00000000..1b55b271 --- /dev/null +++ b/src/inc/jellyfish/parse_quake.hpp @@ -0,0 +1,152 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . +*/ + +#ifndef __JELLYFISH_PARSE_QUAKE_HPP__ +#define __JELLYFISH_PARSE_QUAKE_HPP__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace jellyfish { + class parse_quake : public double_fifo_input { + typedef std::vector fary_t; + + uint_t mer_len; + size_t buffer_size; + const fary_t files; + fary_t::const_iterator current_file; + bool have_seam; + allocators::mmap buffer_data; + struct seq *buffers; + char *seam; + const char quality_start; + bool canonical; + seq_qual_parser *fparser; + + public: + /* Action to take for a given letter in fasta file: + * A, C, G, T: map to 0, 1, 2, 3. Append to kmer + * Other nucleic acid code: map to -1. reset kmer + * '\n': map to -2. ignore + * Other ASCII: map to -3. 
Skip to next line
+     */
+
+    static const float proba_codes[41];
+    static const float one_minus_proba_codes[41];
+
+    parse_quake(const fary_t &_files, uint_t _mer_len,
+                unsigned int nb_buffers, size_t _buffer_size,
+                const char _qs);
+
+    ~parse_quake() { }
+
+    void set_canonical(bool v = true) { canonical = v; }
+    virtual void fill();
+
+    /// Per-thread consumer: reads (base, quality) character pairs from the
+    /// parser's buckets and adds every mer to a counter, weighted by the
+    /// product of the one_minus_proba_codes values of its bases.
+    class thread {
+      parse_quake     *parser;
+      bucket_t        *sequence;
+      const uint_t     mer_len, lshift;
+      uint64_t         kmer, rkmer;
+      const uint64_t   masq;
+      uint_t           cmlen;
+      const bool       canonical;
+      circular_buffer quals;
+      const char       quality_start;
+      uint64_t         distinct, total;
+      typedef          void (*error_reporter)(std::string& err);
+      error_reporter   error_report;
+
+    public:
+      // masq keeps the low 2 * mer_len bits. Shifting a 64-bit value by 64
+      // is undefined, so mer_len == 32 gets the all-ones mask explicitly;
+      // uint64_t is used rather than 1UL, which may be only 32 bits wide.
+      thread(parse_quake *_parser, const char _qs) :
+        parser(_parser), sequence(0),
+        mer_len(_parser->mer_len), lshift(2 * (mer_len - 1)),
+        kmer(0), rkmer(0),
+        masq(mer_len < 32 ? ((uint64_t)1 << (2 * mer_len)) - 1 : ~(uint64_t)0),
+        cmlen(0), canonical(parser->canonical), quals(mer_len),
+        quality_start(_qs),
+        distinct(0), total(0), error_report(0) { }
+
+      uint64_t get_distinct() const { return distinct; }
+      uint64_t get_total() const { return total; }
+
+      template<typename T>
+      void parse(T &counter) {
+        cmlen = kmer = rkmer = 0;
+        while((sequence = parser->next())) {
+          const char         *start = sequence->start;
+          const char * const  end   = sequence->end;
+          while(start < end) {
+            // Index through unsigned char: where char is signed, a byte
+            // >= 0x80 would otherwise be sign-extended into a huge index.
+            const uint_t c = dna_codes[(unsigned char)*start++];
+            const char   q = *start++;
+            switch(c) {
+            case CODE_IGNORE: break;
+            case CODE_COMMENT:
+              // base and quality were both consumed: the base is at start-2
+              report_bad_input(*(start-2));
+              // Fall through
+            case CODE_RESET:
+              cmlen = kmer = rkmer = 0;
+              break;
+
+            default:
+              kmer = ((kmer << 2) & masq) | c;
+              rkmer = (rkmer >> 2) | ((0x3 - c) << lshift);
+              // q comes straight from the input file. Clamp the index to the
+              // 41-entry table instead of reading outside it when the
+              // quality character is below quality_start or above +40.
+              int qi = q - quality_start;
+              if(qi < 0)
+                qi = 0;
+              else if(qi > 40)
+                qi = 40;
+              const float one_minus_p = one_minus_proba_codes[qi];
+              quals.append(one_minus_p);
+              if(++cmlen >= mer_len) {
+                cmlen  = mer_len;
+                Float oval;
+
+                if(canonical)
+                  counter->add(kmer < rkmer ?
kmer : rkmer, quals.prod(), &oval); + else + counter->add(kmer, quals.prod(), &oval); + distinct += oval == (Float)0.0f; + ++total; + } + } + } + + // Buffer exhausted. Get a new one + cmlen = kmer = rkmer = 0; + parser->release(sequence); + } + } + + void set_error_reporter(error_reporter e) { + error_report = e; + } + private: + void report_bad_input(char c) { + if(!error_report) + return; + std::string error("Bad character in sequence: "); + error += c; + error_report(error); + } + }; + friend class thread; + thread new_thread() { return thread(this, quality_start); } + }; +} + +#endif diff --git a/src/inc/jellyfish/parse_qual_dna.hpp b/src/inc/jellyfish/parse_qual_dna.hpp new file mode 100644 index 00000000..cdaa2d02 --- /dev/null +++ b/src/inc/jellyfish/parse_qual_dna.hpp @@ -0,0 +1,147 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . 
+*/
+
+#ifndef __JELLYFISH_PARSE_QUAL_DNA_HPP__
+#define __JELLYFISH_PARSE_QUAL_DNA_HPP__
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace jellyfish {
+  class parse_qual_dna : public double_fifo_input {
+    typedef std::vector fary_t;
+
+    uint_t                      mer_len;
+    size_t                      buffer_size;
+    const fary_t                files;
+    fary_t::const_iterator      current_file;
+    bool                        have_seam;
+    allocators::mmap            buffer_data;
+    char                       *seam;
+    const char                  quality_start;
+    const char                  min_q;
+    bool                        canonical;
+    seq_qual_parser            *fparser;
+
+  public:
+    /* Action to take for a given letter in fasta file:
+     * A, C, G, T: map to 0, 1, 2, 3. Append to kmer
+     * Other nucleic acid code: map to -1. reset kmer
+     * '\n': map to -2. ignore
+     * Other ASCII: map to -3. Skip to next line
+     */
+    parse_qual_dna(const fary_t &_files, uint_t _mer_len,
+                   unsigned int nb_buffers, size_t _buffer_size,
+                   const char _qs, const char _min_q);
+
+    ~parse_qual_dna() { }
+
+    void set_canonical(bool v = true) { canonical = v; }
+    virtual void fill();
+
+    /// Per-thread consumer: reads (base, quality) character pairs from the
+    /// parser's buckets and counts every mer whose bases all have a quality
+    /// character of at least q_thresh (= _qs + _min_q).
+    class thread {
+      parse_qual_dna *parser;
+      bucket_t       *sequence;
+      const uint_t    mer_len, lshift;
+      uint64_t        kmer, rkmer;
+      const uint64_t  masq;
+      uint_t          cmlen;
+      const bool      canonical;
+      const char      q_thresh;
+      uint64_t        distinct, total;
+      typedef         void (*error_reporter)(std::string& err);
+      error_reporter  error_report;
+
+    public:
+      // masq keeps the low 2 * mer_len bits. Shifting a 64-bit value by 64
+      // is undefined, so mer_len == 32 gets the all-ones mask explicitly;
+      // uint64_t is used rather than 1UL, which may be only 32 bits wide.
+      thread(parse_qual_dna *_parser, const char _qs, const char _min_q) :
+        parser(_parser), sequence(0),
+        mer_len(_parser->mer_len), lshift(2 * (mer_len - 1)),
+        kmer(0), rkmer(0),
+        masq(mer_len < 32 ? ((uint64_t)1 << (2 * mer_len)) - 1 : ~(uint64_t)0),
+        cmlen(0), canonical(parser->canonical),
+        q_thresh(_qs + _min_q),
+        distinct(0), total(0), error_report(0) { }
+
+      uint64_t get_distinct() const { return distinct; }
+      uint64_t get_total() const { return total; }
+
+      template<typename T>
+      void parse(T &counter) {
+        cmlen = kmer = rkmer = 0;
+        while((sequence = parser->next())) {
+          const char         *start = sequence->start;
+          const char * const  end   = sequence->end;
+          while(start < end) {
+            // Index through unsigned char: where char is signed, a byte
+            // >= 0x80 would otherwise be sign-extended into a huge index.
+            uint_t     c = dna_codes[(unsigned char)*start++];
+            const char q = *start++;
+            // A base below the quality threshold breaks the current mer.
+            if(q < q_thresh)
+              c = CODE_RESET;
+
+            switch(c) {
+            case CODE_IGNORE: break;
+            case CODE_COMMENT:
+              // Both the base and its quality were consumed above, so the
+              // offending base is at start-2 (start-1 is the quality char).
+              report_bad_input(*(start-2));
+              // Fall through
+            case CODE_RESET:
+              cmlen = kmer = rkmer = 0;
+              break;
+
+            default:
+              kmer = ((kmer << 2) & masq) | c;
+              rkmer = (rkmer >> 2) | ((0x3 - c) << lshift);
+              if(++cmlen >= mer_len) {
+                cmlen  = mer_len;
+                typename T::val_type oval;
+                if(canonical)
+                  counter->add(kmer < rkmer ? kmer : rkmer, 1, &oval);
+                else
+                  counter->add(kmer, 1, &oval);
+                distinct += !oval;
+                ++total;
+              }
+            }
+          }
+
+          // Buffer exhausted. Get a new one
+          cmlen = kmer = rkmer = 0;
+          parser->release(sequence);
+        }
+      }
+
+      void set_error_reporter(error_reporter e) {
+        error_report = e;
+      }
+
+    private:
+      void report_bad_input(char c) {
+        if(!error_report)
+          return;
+        std::string error("Bad input: ");
+        error += c;
+        error_report(error);
+      }
+    };
+    friend class thread;
+    thread new_thread() { return thread(this, quality_start, min_q); }
+  };
+}
+
+#endif
diff --git a/src/inc/jellyfish/raw_dumper.hpp b/src/inc/jellyfish/raw_dumper.hpp
new file mode 100644
index 00000000..3b9c8aff
--- /dev/null
+++ b/src/inc/jellyfish/raw_dumper.hpp
@@ -0,0 +1,244 @@
+/* This file is part of Jellyfish.
+
+    Jellyfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Jellyfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Jellyfish.  If not, see .
+*/ + +#ifndef __RAW_DUMPER_HPP__ +#define __RAW_DUMPER_HPP__ + +#include +#include +#include +#include +#include + +namespace jellyfish { + namespace raw_hash { + static const char *file_type __attribute__((used)) = "JFRHSHDN"; + struct header { + char type[8]; + uint64_t key_len; + uint64_t val_len; + uint64_t size; + uint64_t max_reprobe; + }; + define_error_class(ErrorReading); + template + class dumper : public dumper_t, public thread_exec { + typedef token_ring token_ring_t; + struct thread_info_t { + token_ring_t::token *token; + }; + const uint_t threads; + const std::string file_prefix; + storage_t *const ary; + int file_index; + token_ring_t tr; + + struct thread_info_t *thread_info; + size_t nb_records, nb_blocks; + std::ofstream *out; + + public: + dumper(uint_t _threads, const char *_file_prefix, size_t chunk_size, storage_t *_ary) : + threads(_threads), file_prefix(_file_prefix), + ary(_ary), file_index(0), tr() + { + nb_records = ary->floor_block(chunk_size / ary->get_block_len(), nb_blocks); + while(nb_records < ary->get_max_reprobe_offset()) { + nb_records = ary->floor_block(2 * nb_records, nb_blocks); + } + thread_info = new struct thread_info_t[threads]; + for(uint_t i = 0; i < threads; i++) { + thread_info[i].token = tr.new_token(); + } + } + + ~dumper() { + if(thread_info) { + delete[] thread_info; + } + } + + virtual void start(int i) { dump_to_file(i); } + void dump_to_file(int i); + void write_header(); + + virtual void _dump(); + }; + + template + class query { + public: + typedef _storage_t storage_t; + typedef typename storage_t::iterator iterator; + + private: + mapped_file _file; + storage_t *_ary; + bool _canonical; + bool _cary_bit; + SquareBinaryMatrix hash_matrix; + SquareBinaryMatrix hash_inverse_matrix; + + public: + explicit query(mapped_file &map) : + _file(map), _ary(0), _canonical(false), _cary_bit(false) { + _ary = init(_file, hash_matrix, hash_inverse_matrix); + } + explicit query(std::string filename) : + 
_file(filename.c_str()), _ary(0), _canonical(false), _cary_bit(false) + { + _ary = init(_file, hash_matrix, hash_inverse_matrix); + } + explicit query(const char* filename) : + _file(filename), _ary(0), _canonical(false), _cary_bit(false) + { + _ary = init(_file, hash_matrix, hash_inverse_matrix); + } + + ~query() { + if(_ary) + delete _ary; + } + + size_t get_size() const { return _ary->get_size(); } + size_t get_key_len() const { return _ary->get_key_len(); } + uint_t get_mer_len() const { return _ary->get_key_len() / 2; } + uint_t get_val_len() const { return _ary->get_val_len(); } + uint_t get_max_reprobe() const { return _ary->get_max_reprobe(); } + size_t get_max_reprobe_offset() const { return _ary->get_max_reprobe_offset(); } + bool get_canonical() const { return _canonical; } + void set_canonical(bool v) { _canonical = v; } + bool get_cary_bit() const { return _cary_bit; } + void set_cary_bit(bool v) { _cary_bit = v; } + SquareBinaryMatrix get_hash_matrix() { return hash_matrix; } + SquareBinaryMatrix get_hash_inverse_matrix() { return hash_inverse_matrix; } + storage_t *get_ary() const { return _ary; } + + iterator get_iterator() const { return iterator_all(); } + iterator iterator_all() const { return _ary->iterator_all(); } + iterator iterator_slice(size_t slice_number, size_t number_of_slice) const { + return _ary->iterator_slice(slice_number, number_of_slice); + } + + typename storage_t::val_t operator[](const char *key_s) const { + typename storage_t::key_t key = parse_dna::mer_string_to_binary(key_s, get_mer_len()); + return (*this)[key]; + } + typename storage_t::val_t operator[](const typename storage_t::key_t &key) const { + typename storage_t::val_t res = 0; + bool success; + if(_canonical) { + typename storage_t::key_t key2 = parse_dna::reverse_complement(key, get_mer_len()); + success = _ary->get_val(key2 < key ? key2 : key, res, true, _cary_bit); + } else + success = _ary->get_val(key, res, true, _cary_bit); + return success ? 
res : 0; + } + + bool has_key(const char *key_s) const { + return has_key(parse_dna::mer_string_to_binary(key_s, get_mer_len())); + } + bool has_key(const typename storage_t::key_t &key) const { + typename storage_t::val_t res = 0; + if(_canonical) { + typename storage_t::key_t key2 = parse_dna::reverse_complement(key, get_mer_len()); + return _ary->get_val(key2 < key ? key2 : key, res, false); + } else { + return _ary->get_val(key, res, false); + } + } + + + static storage_t* init(mapped_file& _file, + SquareBinaryMatrix& hash_matrix, + SquareBinaryMatrix& hash_inverse_matrix) { + if(_file.length() < sizeof(struct header)) + eraise(ErrorReading) << "'" << _file.path() << "': " + << "File truncated"; + char *map = _file.base(); + struct header *header = (struct header *)map; + map += sizeof(struct header); + if(strncmp(header->type, file_type, sizeof(header->type))) + eraise(ErrorReading) << "'" << _file.path() << "': " + << "Invalid file format '" + << err::substr(header->type, sizeof(header->type)) + << "'. Expected '" << file_type << "'."; + if(header->size != (1UL << floorLog2(header->size))) + eraise(ErrorReading) << "'" << _file.path() << "': " + << "Size '" << header->size << "' is not a power of 2"; + if(header->key_len > 64 || header->key_len == 0) + eraise(ErrorReading) << "'" << _file.path() << "': " + << "Invalid key length '" << header->key_len << "'"; + // TODO: Should that be in the file instead? 
+ // reprobes = jellyfish::quadratic_reprobes; + map += hash_matrix.read(map); + if((uint_t)hash_matrix.get_size() != header->key_len) + eraise(ErrorReading) << "'" << _file.path() << "': " + << "Size of hash matrix '" << hash_matrix.get_size() + << "' not equal to key length '" << header->key_len << "'"; + map += hash_inverse_matrix.read(map); + if((uint_t)hash_inverse_matrix.get_size() != header->key_len) + eraise(ErrorReading) << "'" << _file.path() << "': " + << "Size of inverse hash matrix '" << hash_inverse_matrix.get_size() + << "' not equal to key length '" << header->key_len << "'"; + if((size_t)map & 0x7) + map += 0x8 - ((size_t)map & 0x7); // Make sure aligned for 64bits word. TODO: use alignof? + return new storage_t(map, header->size, header->key_len, header->val_len, + header->max_reprobe, jellyfish::quadratic_reprobes, + hash_matrix, hash_inverse_matrix); + } + }; + + template + void dumper::_dump() { + std::ofstream _out; + open_next_file(file_prefix.c_str(), &file_index, _out); + out = &_out; + tr.reset(); + write_header(); + exec_join(threads); + _out.close(); + } + + template + void dumper::dump_to_file(int id) { + size_t i; + struct thread_info_t *my_info = &thread_info[id]; + + for(i = id; i * nb_records < ary->get_size(); i += threads) { + my_info->token->wait(); + ary->write_blocks(out, i * nb_blocks, nb_blocks); + my_info->token->pass(); + ary->zero_blocks(i * nb_blocks, nb_blocks); + } + } + + template + void dumper::write_header() { + struct header header; + memcpy(&header.type, file_type, sizeof(header.type)); + header.key_len = ary->get_key_len(); + header.val_len = ary->get_val_len(); + header.size = ary->get_size(); + header.max_reprobe = ary->get_max_reprobe(); + out->write((char *)&header, sizeof(header)); + ary->write_ary_header(out); + } + + } +} + +#endif diff --git a/src/inc/jellyfish/seq_qual_parser.hpp b/src/inc/jellyfish/seq_qual_parser.hpp new file mode 100644 index 00000000..c6c38cf5 --- /dev/null +++ 
b/src/inc/jellyfish/seq_qual_parser.hpp @@ -0,0 +1,66 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . +*/ + +#ifndef __JELLYFISH_SEQ_QUAL_PARSER_HPP__ +#define __JELLYFISH_SEQ_QUAL_PARSER_HPP__ + +#include +#include +#include +#include +#include + +namespace jellyfish { + class seq_qual_parser : public file_parser { + public: + seq_qual_parser(int fd, const char *path, const char *str, size_t len) : + file_parser(fd, path, str, len) {} + virtual ~seq_qual_parser() {} + + struct sequence_t { + char *start; + char *end; + }; + static seq_qual_parser *new_parser(const char *path); + + // parse some input data into the buffer [start, *end). Returns + // false if there is no more data in the input. **end is an + // input/output parameter. It points past the end of the buffer + // when called and should point past the end of the data when + // returned. The base and its ASCII qual value are one next to + // another. 
+ virtual bool parse(char *start, char **end) = 0; + + protected: + }; + + class fastq_seq_qual_parser : public seq_qual_parser { + public: + fastq_seq_qual_parser(int fd, const char *path, const char *str, size_t len) : + seq_qual_parser(fd, path, str, len) {} + + virtual ~fastq_seq_qual_parser() {} + virtual bool parse(char *start, char **end); + + define_error_class(FastqSeqQualParserError); + + private: + void copy_qual_values(char *&qual_start, const char *start); + simple_growing_array _read_buf; + }; +} + +#endif diff --git a/src/inc/jellyfish/sequence_parser.hpp b/src/inc/jellyfish/sequence_parser.hpp new file mode 100644 index 00000000..5df903fe --- /dev/null +++ b/src/inc/jellyfish/sequence_parser.hpp @@ -0,0 +1,71 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . +*/ + +#ifndef __JELLYFISH_SEQUENCE_PARSER_HPP__ +#define __JELLYFISH_SEQUENCE_PARSER_HPP__ + +#include +#include +#include +#include +#include + +namespace jellyfish { + class sequence_parser : public file_parser { + protected: + public: + sequence_parser(int fd, const char *path, const char *str, size_t len) : + file_parser(fd, path, str, len) { } + virtual ~sequence_parser() { } + + struct sequence_t { + char *start; + char *end; + }; + static sequence_parser *new_parser(const char *path); + + // parse some input data into the buffer [start, *end). Returns + // false if there is no more data in the input. 
**end is an + // input/output parameter. It points past the end of the buffer + // when called and should point past the end of the data when + // returned. + virtual bool parse(char *start, char **end) = 0; + }; + + class fasta_sequence_parser : public sequence_parser { + public: + fasta_sequence_parser(int fd, const char *path, const char *str, size_t len) : + sequence_parser(fd, path, str, len) {} + + virtual ~fasta_sequence_parser() {} + + virtual bool parse(char *start, char **end); + }; + + class fastq_sequence_parser : public sequence_parser { + unsigned long seq_len; + public: + fastq_sequence_parser(int fd, const char *path, const char *str, + size_t len) : + sequence_parser(fd, path, str, len), seq_len(0) {} + + virtual ~fastq_sequence_parser() {} + virtual bool parse(char *start, char **end); + + }; +} + +#endif diff --git a/src/inc/jellyfish/simple_circular_buffer.hpp b/src/inc/jellyfish/simple_circular_buffer.hpp new file mode 100644 index 00000000..cf1f7d31 --- /dev/null +++ b/src/inc/jellyfish/simple_circular_buffer.hpp @@ -0,0 +1,134 @@ +/* Jellyfish + * Copyright (C) 2012 Genome group at University of Maryland. + * + * This program is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef __SIMPLE_CIRCULAR_BUFFER_H__ +#define __SIMPLE_CIRCULAR_BUFFER_H__ + +#include + +namespace jellyfish { + namespace simple_circular_buffer { + + // T: type of element in container. 
D: type of derived class for + // CRTP. A: allocator type. + template + class base { + public: + explicit base(T* data) : + data_(data), front_(0), back_(0), full_(false) + { } + + // Return true if empty + bool empty() const { + return front_ == back_ && !full(); + } + // Return true if full + bool full() const { + return full_; + } + void clear() { + front_ = back_; + full_ = false; + } + + // Valid only if empty() is false + T& front() { + return data_[front_]; + } + // Valid only if empty() is false + T& back() { + return data_[prev_index(back_)]; + } + + // Unlike the corresponding method on list or deqeue, push_back may + // fail if full() is true. Then false is returned. + bool push_back(const T& x) { + if(full()) + return false; + data_[back_] = x; + back_ = next_index(back_); + full_ = back_ == front_; + return true; + } + + bool push_back() { + if(full()) + return false; + back_ = next_index(back_); + full_ = back_ == front_; + return true; + } + + // Pop an element from the front. It has no effect if empty() is true + void pop_front() { + if(empty()) + return; + front_ = next_index(front_); + full_ = false; + } + + int size() const { + if(full()) + return static_cast(this)->capacity(); + int s = back_ - front_; + return s < 0 ? s + static_cast(this)->capacity() : s; + } + + protected: + int next_index(int i) const { + return (i + 1) % static_cast(this)->capacity(); + } + int prev_index(int i) const { + return i ? 
i - 1 : static_cast(this)->capacity() - 1; + } + T* data() const { return data_; } + + T* data_; + int front_, back_; + bool full_; + }; + + template + class pre_alloc : public base > { + typedef base > super; + public: + explicit pre_alloc(T* data) : super(data) { } + static int capacity() { return capa; } + }; + + // template > + // class fixed : public base, A> { + // typedef base, A> super; + // public: + // explicit fixed(const T v = T()) : super(capa, v) { } + // // fixed(const int ignored_size, const T v = T()) : super(capa, v) { } + + // int capacity() const { return capa; } + // }; + + // template > + // class dyn : public base, A> { + // typedef base, A> super; + // public: + // explicit dyn(int size, const T v = T()) : super(size, v), capa_(size) { } + + // int capacity() const { return capa_; } + // int capa_; + // }; + } +} +#endif /* __SIMPLE_CIRCULAR_BUFFER_H__ */ diff --git a/src/inc/jellyfish/simple_growing_array.hpp b/src/inc/jellyfish/simple_growing_array.hpp new file mode 100644 index 00000000..9a7f117d --- /dev/null +++ b/src/inc/jellyfish/simple_growing_array.hpp @@ -0,0 +1,60 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . +*/ + +// Simple growing array. Never shrinks. 
No construction or destruction
+// of objects in array, only copy with pushback() methods
+namespace jellyfish {
+  /// Growable array of T backed by malloc-family storage. Elements are
+  /// copied in with operator= on raw memory and never destroyed.
+  /// NOTE(review): the class owns _data but declares no copy constructor or
+  /// assignment; copying an instance would free the buffer twice.
+  template<typename T>
+  class simple_growing_array {
+    size_t  _capacity;
+    size_t  _size;
+    T      *_data;
+  public:
+    /// Allocates room for about `capacity` elements (capacity / 2, doubled
+    /// by the first resize(); at least one element).
+    explicit simple_growing_array(size_t capacity = 100) :
+      _capacity(capacity / 2), _size(0), _data(0) { resize(); }
+
+    virtual ~simple_growing_array() {
+      free(_data);
+    }
+
+    /// Append a copy of x, growing the storage first when it is full.
+    void push_back(const T &x) {
+      if(_size >= _capacity)
+        resize();
+      _data[_size++] = x;
+    }
+
+    /// Forget all elements; the storage is kept.
+    void reset() { _size = 0; }
+
+    size_t size() const { return _size; }
+    bool empty() const { return _size == 0; }
+    const T * begin() const { return _data; }
+    const T * end() const { return _data + _size; }
+
+  private:
+    define_error_class(SimpleGrowingArrayError);
+    /// Double the capacity. Raises SimpleGrowingArrayError when out of
+    /// memory, leaving the array unchanged.
+    void resize() {
+      // A capacity of 0 (constructor called with 0 or 1) must still grow:
+      // doubling 0 stays 0 and push_back() would write past the buffer.
+      const size_t ncapacity = _capacity ? 2 * _capacity : 1;
+      void * ndata = realloc(_data, sizeof(T) * ncapacity);
+      if(ndata == 0) {
+        // realloc failed: the old block is still allocated and still owned
+        // by _data. Keep it (the destructor frees it) instead of dropping
+        // the pointer and leaking the block. eraise presumably throws.
+        eraise(SimpleGrowingArrayError) << "Out of memory" << err::no;
+      }
+      _data     = (T*)ndata;
+      _capacity = ncapacity;
+    }
+  };
+}
diff --git a/src/inc/jellyfish/sorted_dumper.hpp b/src/inc/jellyfish/sorted_dumper.hpp
new file mode 100644
index 00000000..ef34738f
--- /dev/null
+++ b/src/inc/jellyfish/sorted_dumper.hpp
@@ -0,0 +1,166 @@
+/* This file is part of Jellyfish.
+
+    Jellyfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Jellyfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Jellyfish.  If not, see .
+*/ + +#include +#include +#include +#include +#include +#include + +namespace jellyfish { + template + class sorted_dumper : public dumper_t, public thread_exec { + typedef typename storage_t::overlap_iterator iterator; + typedef compacted_hash::writer writer_t; + typedef heap_t oheap_t; + typedef token_ring token_ring_t; + + struct thread_info_t { + writer_t writer; + oheap_t heap; + token_ring_t::token *token; + }; + + uint_t threads; + std::string file_prefix; + size_t buffer_size; + uint_t klen, vlen; + uint_t key_len, val_len; + size_t record_len, nb_records, nb_blocks; + storage_t *ary; + int file_index; + token_ring_t tr; + uint64_t lower_count, upper_count; + struct thread_info_t *thread_info; + uint64_t volatile unique, distinct, total, max_count; + std::ofstream *out; + locks::pthread::mutex dump_mutex; + bool one_file; + + public: + // klen: key field length in bits in hash (i.e before rounding up to bytes) + // vlen: value field length in bits + sorted_dumper(uint_t _threads, const char *_file_prefix, size_t _buffer_size, + uint_t _vlen, storage_t *_ary) : + threads(_threads), file_prefix(_file_prefix), buffer_size(_buffer_size), + klen(_ary->get_key_len()), vlen(_vlen), ary(_ary), file_index(0), + tr(), lower_count(0), upper_count(std::numeric_limits::max()), + one_file(false) + { + key_len = bits_to_bytes(klen); + val_len = bits_to_bytes(vlen); + record_len = key_len + val_len; + nb_records = ary->floor_block(_buffer_size / record_len, nb_blocks); + while(nb_records < ary->get_max_reprobe_offset()) { + nb_records = ary->floor_block(2 * nb_records, nb_blocks); + } + + thread_info = new struct thread_info_t[threads]; + for(uint_t i = 0; i < threads; i++) { + // thread_info[i].token = i == 0; + thread_info[i].writer.initialize(nb_records, klen, vlen, ary); + thread_info[i].heap.initialize(ary->get_max_reprobe_offset()); + thread_info[i].token = tr.new_token(); + } + unique = distinct = total = max_count = 0; + } + + ~sorted_dumper() { + if(thread_info) 
{ + delete[] thread_info; + } + } + + bool get_one_file() const { return one_file; } + void set_one_file(bool nv) { one_file = nv; } + + void set_lower_count(uint64_t l) { lower_count = l; } + void set_upper_count(uint64_t u) { upper_count = u; } + + virtual void start(int i) { dump_to_file(i); } + void dump_to_file(int i); + + virtual void _dump(); + void update_stats() { + thread_info[0].writer.update_stats_with(out, unique, distinct, total, + max_count); + } + }; + + template + void sorted_dumper::_dump() { + std::ofstream _out; + assert(dump_mutex.try_lock()); + if(one_file) { + _out.open(file_prefix.c_str()); + } else { + open_next_file(file_prefix.c_str(), &file_index, _out); + } + out = &_out; + unique = distinct = total = max_count = 0; + tr.reset(); + thread_info[0].writer.write_header(out); + exec_join(threads); + ary->zero_blocks(0, nb_blocks); // zero out last group of blocks + update_stats(); + _out.close(); + dump_mutex.unlock(); + } + + template + void sorted_dumper::dump_to_file(int id) { + size_t i; + struct thread_info_t *my_info = &thread_info[id]; + atomic_t atomic; + + my_info->writer.reset_counters(); + + for(i = id; i * nb_records < ary->get_size(); i += threads) { + // fill up buffer + iterator it(ary, i * nb_records, (i + 1) * nb_records); + my_info->heap.fill(it); + + while(it.next()) { + typename oheap_t::const_item_t item = my_info->heap.head(); + if(item->val >= lower_count && item->val <= upper_count) + my_info->writer.append(item->key, item->val); + my_info->heap.pop(); + my_info->heap.push(it); + } + + while(my_info->heap.is_not_empty()) { + typename oheap_t::const_item_t item = my_info->heap.head(); + if(item->val >= lower_count && item->val <= upper_count) + my_info->writer.append(item->key, item->val); + my_info->heap.pop(); + } + + my_info->token->wait(); + my_info->writer.dump(out); + my_info->token->pass(); + + // zero out memory + if(i > 0) + ary->zero_blocks(i * nb_blocks, nb_blocks); + } + + atomic.add_fetch(&unique, 
my_info->writer.get_unique()); + atomic.add_fetch(&distinct, my_info->writer.get_distinct()); + atomic.add_fetch(&total, my_info->writer.get_total()); + atomic.set_to_max(&max_count, my_info->writer.get_max_count()); + } +} diff --git a/src/inc/jellyfish/square_binary_matrix.hpp b/src/inc/jellyfish/square_binary_matrix.hpp new file mode 100644 index 00000000..67e3f495 --- /dev/null +++ b/src/inc/jellyfish/square_binary_matrix.hpp @@ -0,0 +1,203 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . 
+*/ + +#ifndef __JELLYFISH_SQUARE_BINARY_MATRIX_HPP__ +#define __JELLYFISH_SQUARE_BINARY_MATRIX_HPP__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +class SquareBinaryMatrix { +public: + define_error_class(ErrorAllocation); + define_error_class(SingularMatrix); + define_error_class(MismatchingSize); + +private: + uint64_t *columns; + int size; + + uint64_t mask() const { return (((uint64_t)1) << size) - 1; } + uint64_t msb() const { return ((uint64_t)1) << (size - 1); } + uint64_t *first_alloc(size_t size) { + uint64_t *res = calloc_align((size_t)size, (size_t)16); + if(!res) + eraise(ErrorAllocation) << "Can't allocate matrix of size '" << size << "'"; + return res; + } + void alloc_columns() { + if(columns) { + free(columns); + columns = 0; + } + if(size < 0 || size > 64) + eraise(MismatchingSize) << "Invalid matrix size '" << size << "'"; + columns = first_alloc(size); + } + +public: + SquareBinaryMatrix() : columns(0), size(0) { } + + explicit SquareBinaryMatrix(int _size) :columns(first_alloc(_size)), size(_size) { + memset(columns, '\0', sizeof(uint64_t) * _size); + } + SquareBinaryMatrix(const SquareBinaryMatrix &rhs) : columns(first_alloc(rhs.get_size())), size(rhs.get_size()) { + int i; + + uint64_t _mask = mask(); + for(i = 0; i < size; i++) + columns[i] = rhs.columns[i] & _mask; + } + SquareBinaryMatrix(const uint64_t *_columns, int _size) : columns(first_alloc(_size)), size(_size) { + int i; + uint64_t _mask = mask(); + + for(i = 0; i < size; i++) + columns[i] = _columns[i] & _mask; + } + explicit SquareBinaryMatrix(const char *map) : columns(0), size(0) { + read(map); + } + explicit SquareBinaryMatrix(std::istream *is) : columns(0), size(0) { + load(is); + } + + ~SquareBinaryMatrix() { + if(columns) + free(columns); + } + + void swap(SquareBinaryMatrix &rhs) { + std::swap(columns, rhs.columns); + std::swap(size, rhs.size); + } + + SquareBinaryMatrix &operator=(SquareBinaryMatrix rhs) { + this->swap(rhs); 
+ return *this; + } + + void init_random(); + + SquareBinaryMatrix init_random_inverse(); + + void init_identity() { + uint64_t v = msb(); + int i; + for(i = 0; v; i++, v >>= 1) + columns[i] = v; + } + + bool is_identity() const { + uint64_t v = msb(); + int i; + + for(i = 0; i < size; i++, v >>= 1) { + if(columns[i] != v) + return false; + } + return true; + } + + bool is_zero() const { + int i; + for(i = 0; i < size; i++) + if(columns[i]) + return false; + return true; + } + + void resize(int ns) { size = ns; alloc_columns(); } + int get_size() const { return size; } + + bool operator==(const SquareBinaryMatrix &other) const { + int i; + if(size != other.get_size()) + return false; + for(i = 0; i < size; i++) + if(columns[i] != other.columns[i]) + return false; + + return true; + } + + bool operator!=(const SquareBinaryMatrix &other) const { + return !(*this == other); + } + + uint64_t & operator[](int i) { + return columns[i]; + } + + uint64_t operator[](int i) const { + return columns[i]; + } + + uint64_t times_loop(uint64_t v) const { + uint64_t res = 0, *c = columns+(size-1); + + for ( ; v; v >>= 1) + res ^= (-(v & 1)) & *c--; + return res; + } + + uint64_t times_unrolled(uint64_t v) const; + uint64_t times_sse(uint64_t v) const; + + inline uint64_t times(uint64_t v) const { +#ifdef HAVE_SSE + return times_sse(v); +#else + return times_unrolled(v); +#endif + } + + SquareBinaryMatrix transpose() const; + SquareBinaryMatrix operator*(const SquareBinaryMatrix &other) const; + + SquareBinaryMatrix inverse() const; + int pop_count() const { + int i, res = 0; + for(i = 0; i < size; i++) + res += __builtin_popcountl(columns[i]); + return res; + } + + uint64_t xor_sum() const { + uint64_t sum = 0; + for(int i = 0; i < size; ++i) + sum ^= columns[i]; + return sum; + } + + void print(std::ostream *os) const; + std::string str() const; + void dump(std::ostream *os) const; + void load(std::istream *is); + size_t read(const char *map); + size_t dump_size() { return 
sizeof(size) + sizeof(uint64_t) * size; } + void print_vector(std::ostream *os, uint64_t v, bool vertical = false) const; + std::string str_vector(uint64_t v, bool vertical = false) const; +}; + +#endif // __SQUARE_BINARY_MATRIX_HPP__ diff --git a/src/inc/jellyfish/storage.hpp b/src/inc/jellyfish/storage.hpp new file mode 100644 index 00000000..692df2de --- /dev/null +++ b/src/inc/jellyfish/storage.hpp @@ -0,0 +1,37 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . +*/ + +#ifndef __JELLYFISH_STORAGE_HPP__ +#define __JELLYFISH_STORAGE_HPP__ + +#include +#include +#include + +namespace jellyfish { + + class storage_t { + public: + storage_t() {} + virtual ~storage_t() {} + }; + + // Entry 0 is used only when switching to a large field + extern size_t *quadratic_reprobes; + +} + +#endif // __STORAGE_HPP__ diff --git a/src/inc/jellyfish/thread_exec.hpp b/src/inc/jellyfish/thread_exec.hpp new file mode 100644 index 00000000..c0dcb2ff --- /dev/null +++ b/src/inc/jellyfish/thread_exec.hpp @@ -0,0 +1,51 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . +*/ + +#ifndef __JELLYFISH_THREAD_EXEC_HPP__ +#define __JELLYFISH_THREAD_EXEC_HPP__ + +#include +#include +#include +#include +#include +#include +#include +#include + +class thread_exec { + struct thread_info { + int id; + pthread_t thid; + thread_exec *self; + }; + static void *start_routine(void *); + std::vector infos; + +public: + define_error_class(Error); + thread_exec() {} + virtual ~thread_exec() {} + virtual void start(int id) = 0; + void exec(int nb_threads); + void join(); + void exec_join(int nb_threads) { + exec(nb_threads); + join(); + } +}; + +#endif // __THREAD_EXEC_HPP__ diff --git a/src/inc/jellyfish/time.hpp b/src/inc/jellyfish/time.hpp new file mode 100644 index 00000000..e490c1cd --- /dev/null +++ b/src/inc/jellyfish/time.hpp @@ -0,0 +1,92 @@ +/* This file is part of Jellyfish. + + Jellyfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Jellyfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Jellyfish. If not, see . 
/* (closes the license header that opens on the preceding line) */

#ifndef __JELLYFISH_TIME_HPP__
#define __JELLYFISH_TIME_HPP__

// NOTE(review): the four original #include targets were lost when the patch
// text was extracted (everything between '<' and '>' was dropped); the
// headers below are the ones this file visibly needs -- confirm.
#include <sys/time.h>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>

// A point in time or a duration with microsecond resolution, stored in a
// struct timeval.
class Time {
  static const suseconds_t max_useconds = 1000000UL;
  struct timeval tv;

 public:
  static const Time zero;

  // With init == true the object holds the current time; otherwise it holds
  // 0 seconds, 0 microseconds (the fields used to be left uninitialized).
  explicit Time(bool init = true) {
    tv.tv_sec  = 0;
    tv.tv_usec = 0;
    if(init)
      now();
  }
  Time(time_t sec, suseconds_t usec) {
    tv.tv_sec  = sec;
    tv.tv_usec = usec;
  }
  Time &operator=(const Time &o) {
    if(&o != this) {
      tv.tv_sec  = o.tv.tv_sec;
      tv.tv_usec = o.tv.tv_usec;
    }
    return *this;
  }

  // Subtraction, borrowing one second when the microseconds would go negative.
  Time & operator-=(const Time &o) {
    tv.tv_sec -= o.tv.tv_sec;
    if(o.tv.tv_usec > tv.tv_usec) {
      tv.tv_usec = (max_useconds + tv.tv_usec) - o.tv.tv_usec;
      --tv.tv_sec;
    } else {
      tv.tv_usec -= o.tv.tv_usec;
    }
    return *this;
  }
  const Time operator-(const Time &o) const {
    return Time(*this) -= o;
  }

  // Addition, carrying into the seconds when the microseconds reach 1000000.
  Time & operator+=(const Time &o) {
    tv.tv_sec  += o.tv.tv_sec;
    tv.tv_usec += o.tv.tv_usec;
    if(tv.tv_usec >= max_useconds) {
      ++tv.tv_sec;
      tv.tv_usec -= max_useconds;
    }
    return *this;
  }
  const Time operator+(const Time &o) const {
    return Time(*this) += o;
  }

  bool operator<(const Time& o) const {
    return tv.tv_sec < o.tv.tv_sec || (tv.tv_sec == o.tv.tv_sec && tv.tv_usec < o.tv.tv_usec);
  }

  // Sets the object to the current time of day.
  void now() { gettimeofday(&tv, NULL); }
  // Time elapsed between this object and the moment of the call.
  Time elapsed() const {
    return Time() - *this;
  }

  // "<seconds>.<microseconds>", the microseconds zero-padded to 6 digits.
  std::string str() const {
    std::ostringstream res;
    res << tv.tv_sec << "."
        << std::setfill('0') << std::setw(6) << std::right << tv.tv_usec;
    return res.str();
  }
};

#endif // __JELLYFISH_TIME_HPP__

// Next file in the patch; header lines kept from the original text.
// diff --git a/src/inc/jellyfish/token_ring.hpp b/src/inc/jellyfish/token_ring.hpp
// new file mode 100644
// index 00000000..69cc7bbe
// --- /dev/null
// +++ b/src/inc/jellyfish/token_ring.hpp
// @@ -0,0 +1,97 @@
/*  This file is part of Jellyfish.
*/
/*  (license header continued from the preceding line)

    Jellyfish is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Jellyfish is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Jellyfish.  If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef __JELLYFISH_TOKEN_RING_HPP__
#define __JELLYFISH_TOKEN_RING_HPP__

// Circular list of tokens, exactly one of which is active. A thread blocks in
// token::wait() until its token is active and hands the active state to the
// next token of the ring with token::pass().
//
// cond_t must provide lock(), unlock(), wait() and signal().
//
// NOTE(review): the template parameter list was lost when the patch text was
// extracted; <typename cond_t> is reconstructed from the use of cond_t below.
template<typename cond_t>
class token_ring {
public:
  class token {
    token  *next;
    bool    val;
    cond_t  cond;

    // Tokens are created by token_ring::new_token() only.
    token(token *_next, bool _val) :
      next(_next), val(_val) {}
    friend class token_ring;

  public:
    bool is_active() const { return val; }

    // Blocks until this token is the active one.
    void wait() {
      cond.lock();
      while(!val) { cond.wait(); }
      cond.unlock();
    }

    // Deactivates this token, activates the next one and wakes its waiter.
    void pass() {
      next->cond.lock();
      val       = false;
      next->val = true;
      next->cond.signal();
      next->cond.unlock();
    }
  };

private:
  token  *first, *last;
  cond_t  cond;

  // The ring owns its tokens through raw pointers; a member-wise copy would
  // delete every token twice, so copying is disabled (declared, not defined).
  token_ring(const token_ring &);
  token_ring &operator=(const token_ring &);

public:
  token_ring() :
    first(0), last(0)
  { }

  ~token_ring() {
    if(!first)
      return;

    while(first != last) {
      token *del = first;
      first = first->next;
      delete del;
    }
    delete last;
  }

  // Makes the first token the only active one.
  void reset() {
    if(!first)
      return;

    for(token *c = first; c != last; c = c->next)
      c->val = false;
    last->val  = false;
    first->val = true;
  }

  // Appends a token at the end of the ring and returns it. The token stays
  // owned by the ring. The very first token created starts out active.
  token *new_token() {
    token *nt = new token(first, first == 0);
    if(first) {
      last->next = nt;
      last       = nt;
    } else {
      first = last = nt;
      nt->next = nt;
    }
    return nt;
  }
};

#endif // __JELLYFISH_TOKEN_RING_HPP__

// Next file in the patch; header lines kept from the original text.
// diff --git a/src/inc/kseq/kseq.h b/src/inc/kseq/kseq.h
// new file mode 100644
// index 00000000..bbe01255
// --- /dev/null
// +++ b/src/inc/kseq/kseq.h
// @@ -0,0 +1,223 @@
/* The MIT License

   Copyright (c) 2008 Genome Research Ltd (GRL).
*/
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* Last Modified: 12APR2009 */ + +#ifndef AC_KSEQ_H +#define AC_KSEQ_H + +#include +#include +#include + +#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r +#define KS_SEP_TAB 1 // isspace() && !' 
' +#define KS_SEP_MAX 1 + +#define __KS_TYPE(type_t) \ + typedef struct __kstream_t { \ + char *buf; \ + int begin, end, is_eof; \ + type_t f; \ + } kstream_t; + +#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) +#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) + +#define __KS_BASIC(type_t, __bufsize) \ + static inline kstream_t *ks_init(type_t f) \ + { \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + ks->f = f; \ + ks->buf = (char*)malloc(__bufsize); \ + return ks; \ + } \ + static inline void ks_destroy(kstream_t *ks) \ + { \ + if (ks) { \ + free(ks->buf); \ + free(ks); \ + } \ + } + +#define __KS_GETC(__read, __bufsize) \ + static inline int ks_getc(kstream_t *ks) \ + { \ + if (ks->is_eof && ks->begin >= ks->end) return -1; \ + if (ks->begin >= ks->end) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end < __bufsize) ks->is_eof = 1; \ + if (ks->end == 0) return -1; \ + } \ + return (int)ks->buf[ks->begin++]; \ + } + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#define __KS_GETUNTIL(__read, __bufsize) \ + static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { \ + if (dret) *dret = 0; \ + str->l = 0; \ + if (ks->begin >= ks->end && ks->is_eof) return -1; \ + for (;;) { \ + int i; \ + if (ks->begin >= ks->end) { \ + if (!ks->is_eof) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end < __bufsize) ks->is_eof = 1; \ + if (ks->end == 0) break; \ + } else break; \ + } \ + if (delimiter > KS_SEP_MAX) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == delimiter) break; \ + } else if (delimiter == KS_SEP_SPACE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i])) break; \ + } 
else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + } else i = 0; /* never come to here! */ \ + if (str->m - str->l < i - ks->begin + 1) { \ + str->m = str->l + (i - ks->begin) + 1; \ + kroundup32(str->m); \ + str->s = (char*)realloc(str->s, str->m); \ + } \ + memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ + str->l = str->l + (i - ks->begin); \ + ks->begin = i + 1; \ + if (i < ks->end) { \ + if (dret) *dret = ks->buf[i]; \ + break; \ + } \ + } \ + if (str->l == 0) { \ + str->m = 1; \ + str->s = (char*)calloc(1, 1); \ + } \ + str->s[str->l] = '\0'; \ + return str->l; \ + } + +#define KSTREAM_INIT(type_t, __read, __bufsize) \ + __KS_TYPE(type_t) \ + __KS_BASIC(type_t, __bufsize) \ + __KS_GETC(__read, __bufsize) \ + __KS_GETUNTIL(__read, __bufsize) + +#define __KSEQ_BASIC(type_t) \ + static inline kseq_t *kseq_init(type_t fd) \ + { \ + kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ + s->f = ks_init(fd); \ + return s; \ + } \ + static inline void kseq_rewind(kseq_t *ks) \ + { \ + ks->last_char = 0; \ + ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ + } \ + static inline void kseq_destroy(kseq_t *ks) \ + { \ + if (!ks) return; \ + free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ + ks_destroy(ks->f); \ + free(ks); \ + } + +/* Return value: + >=0 length of the sequence (normal) + -1 end-of-file + -2 truncated quality string + */ +#define __KSEQ_READ \ + static int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ + if (seq->last_char == 0) { /* then jump to the next header line */ \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ + seq->last_char = c; \ + } /* the first header char has been read */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ + if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 
0); \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + if (isgraph(c)) { /* printable non-space character */ \ + if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l++] = (char)c; \ + } \ + } \ + if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ + while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ + if (c == -1) return -2; /* we should not stop here */ \ + while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ + if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ + seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ + seq->last_char = 0; /* we have not come to the next header line */ \ + if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ + return seq->seq.l; \ + } + +#define __KSEQ_TYPE(type_t) \ + typedef struct { \ + kstring_t name, comment, seq, qual; \ + int last_char; \ + kstream_t *f; \ + } kseq_t; + +#define KSEQ_INIT(type_t, __read) \ + KSTREAM_INIT(type_t, __read, 4096) \ + __KSEQ_TYPE(type_t) \ + __KSEQ_BASIC(type_t) \ + __KSEQ_READ + +#endif diff --git a/src/inc/zlib/zconf.h b/src/inc/zlib/zconf.h new file mode 100644 index 00000000..996fff29 --- /dev/null +++ b/src/inc/zlib/zconf.h @@ -0,0 +1,511 @@ +/* zconf.h -- configuration of the zlib compression library + * Copyright (C) 1995-2013 Jean-loup Gailly. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* @(#) $Id$ */ + +#ifndef ZCONF_H +#define ZCONF_H + +/* + * If you *really* need a unique prefix for all types and library functions, + * compile with -DZ_PREFIX. The "standard" zlib should be compiled without it. + * Even better than compiling with -DZ_PREFIX would be to use configure to set + * this permanently in zconf.h using "./configure --zprefix". + */ +#ifdef Z_PREFIX /* may be set to #if 1 by ./configure */ +# define Z_PREFIX_SET + +/* all linked symbols */ +# define _dist_code z__dist_code +# define _length_code z__length_code +# define _tr_align z__tr_align +# define _tr_flush_bits z__tr_flush_bits +# define _tr_flush_block z__tr_flush_block +# define _tr_init z__tr_init +# define _tr_stored_block z__tr_stored_block +# define _tr_tally z__tr_tally +# define adler32 z_adler32 +# define adler32_combine z_adler32_combine +# define adler32_combine64 z_adler32_combine64 +# ifndef Z_SOLO +# define compress z_compress +# define compress2 z_compress2 +# define compressBound z_compressBound +# endif +# define crc32 z_crc32 +# define crc32_combine z_crc32_combine +# define crc32_combine64 z_crc32_combine64 +# define deflate z_deflate +# define deflateBound z_deflateBound +# define deflateCopy z_deflateCopy +# define deflateEnd z_deflateEnd +# define deflateInit2_ z_deflateInit2_ +# define deflateInit_ z_deflateInit_ +# define deflateParams z_deflateParams +# define deflatePending z_deflatePending +# define deflatePrime z_deflatePrime +# define deflateReset z_deflateReset +# define deflateResetKeep z_deflateResetKeep +# define deflateSetDictionary z_deflateSetDictionary +# define deflateSetHeader z_deflateSetHeader +# define deflateTune z_deflateTune +# define deflate_copyright z_deflate_copyright +# define get_crc_table z_get_crc_table +# ifndef Z_SOLO +# define gz_error z_gz_error +# define gz_intmax z_gz_intmax +# define gz_strwinerror z_gz_strwinerror +# define gzbuffer 
z_gzbuffer +# define gzclearerr z_gzclearerr +# define gzclose z_gzclose +# define gzclose_r z_gzclose_r +# define gzclose_w z_gzclose_w +# define gzdirect z_gzdirect +# define gzdopen z_gzdopen +# define gzeof z_gzeof +# define gzerror z_gzerror +# define gzflush z_gzflush +# define gzgetc z_gzgetc +# define gzgetc_ z_gzgetc_ +# define gzgets z_gzgets +# define gzoffset z_gzoffset +# define gzoffset64 z_gzoffset64 +# define gzopen z_gzopen +# define gzopen64 z_gzopen64 +# ifdef _WIN32 +# define gzopen_w z_gzopen_w +# endif +# define gzprintf z_gzprintf +# define gzvprintf z_gzvprintf +# define gzputc z_gzputc +# define gzputs z_gzputs +# define gzread z_gzread +# define gzrewind z_gzrewind +# define gzseek z_gzseek +# define gzseek64 z_gzseek64 +# define gzsetparams z_gzsetparams +# define gztell z_gztell +# define gztell64 z_gztell64 +# define gzungetc z_gzungetc +# define gzwrite z_gzwrite +# endif +# define inflate z_inflate +# define inflateBack z_inflateBack +# define inflateBackEnd z_inflateBackEnd +# define inflateBackInit_ z_inflateBackInit_ +# define inflateCopy z_inflateCopy +# define inflateEnd z_inflateEnd +# define inflateGetHeader z_inflateGetHeader +# define inflateInit2_ z_inflateInit2_ +# define inflateInit_ z_inflateInit_ +# define inflateMark z_inflateMark +# define inflatePrime z_inflatePrime +# define inflateReset z_inflateReset +# define inflateReset2 z_inflateReset2 +# define inflateSetDictionary z_inflateSetDictionary +# define inflateGetDictionary z_inflateGetDictionary +# define inflateSync z_inflateSync +# define inflateSyncPoint z_inflateSyncPoint +# define inflateUndermine z_inflateUndermine +# define inflateResetKeep z_inflateResetKeep +# define inflate_copyright z_inflate_copyright +# define inflate_fast z_inflate_fast +# define inflate_table z_inflate_table +# ifndef Z_SOLO +# define uncompress z_uncompress +# endif +# define zError z_zError +# ifndef Z_SOLO +# define zcalloc z_zcalloc +# define zcfree z_zcfree +# endif +# define 
zlibCompileFlags z_zlibCompileFlags +# define zlibVersion z_zlibVersion + +/* all zlib typedefs in zlib.h and zconf.h */ +# define Byte z_Byte +# define Bytef z_Bytef +# define alloc_func z_alloc_func +# define charf z_charf +# define free_func z_free_func +# ifndef Z_SOLO +# define gzFile z_gzFile +# endif +# define gz_header z_gz_header +# define gz_headerp z_gz_headerp +# define in_func z_in_func +# define intf z_intf +# define out_func z_out_func +# define uInt z_uInt +# define uIntf z_uIntf +# define uLong z_uLong +# define uLongf z_uLongf +# define voidp z_voidp +# define voidpc z_voidpc +# define voidpf z_voidpf + +/* all zlib structs in zlib.h and zconf.h */ +# define gz_header_s z_gz_header_s +# define internal_state z_internal_state + +#endif + +#if defined(__MSDOS__) && !defined(MSDOS) +# define MSDOS +#endif +#if (defined(OS_2) || defined(__OS2__)) && !defined(OS2) +# define OS2 +#endif +#if defined(_WINDOWS) && !defined(WINDOWS) +# define WINDOWS +#endif +#if defined(_WIN32) || defined(_WIN32_WCE) || defined(__WIN32__) +# ifndef WIN32 +# define WIN32 +# endif +#endif +#if (defined(MSDOS) || defined(OS2) || defined(WINDOWS)) && !defined(WIN32) +# if !defined(__GNUC__) && !defined(__FLAT__) && !defined(__386__) +# ifndef SYS16BIT +# define SYS16BIT +# endif +# endif +#endif + +/* + * Compile with -DMAXSEG_64K if the alloc function cannot allocate more + * than 64k bytes at a time (needed on systems with 16-bit int). 
+ */ +#ifdef SYS16BIT +# define MAXSEG_64K +#endif +#ifdef MSDOS +# define UNALIGNED_OK +#endif + +#ifdef __STDC_VERSION__ +# ifndef STDC +# define STDC +# endif +# if __STDC_VERSION__ >= 199901L +# ifndef STDC99 +# define STDC99 +# endif +# endif +#endif +#if !defined(STDC) && (defined(__STDC__) || defined(__cplusplus)) +# define STDC +#endif +#if !defined(STDC) && (defined(__GNUC__) || defined(__BORLANDC__)) +# define STDC +#endif +#if !defined(STDC) && (defined(MSDOS) || defined(WINDOWS) || defined(WIN32)) +# define STDC +#endif +#if !defined(STDC) && (defined(OS2) || defined(__HOS_AIX__)) +# define STDC +#endif + +#if defined(__OS400__) && !defined(STDC) /* iSeries (formerly AS/400). */ +# define STDC +#endif + +#ifndef STDC +# ifndef const /* cannot use !defined(STDC) && !defined(const) on Mac */ +# define const /* note: need a more gentle solution here */ +# endif +#endif + +#if defined(ZLIB_CONST) && !defined(z_const) +# define z_const const +#else +# define z_const +#endif + +/* Some Mac compilers merge all .h files incorrectly: */ +#if defined(__MWERKS__)||defined(applec)||defined(THINK_C)||defined(__SC__) +# define NO_DUMMY_DECL +#endif + +/* Maximum value for memLevel in deflateInit2 */ +#ifndef MAX_MEM_LEVEL +# ifdef MAXSEG_64K +# define MAX_MEM_LEVEL 8 +# else +# define MAX_MEM_LEVEL 9 +# endif +#endif + +/* Maximum value for windowBits in deflateInit2 and inflateInit2. + * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files + * created by gzip. (Files created by minigzip can still be extracted by + * gzip.) + */ +#ifndef MAX_WBITS +# define MAX_WBITS 15 /* 32K LZ77 window */ +#endif + +/* The memory requirements for deflate are (in bytes): + (1 << (windowBits+2)) + (1 << (memLevel+9)) + that is: 128K for windowBits=15 + 128K for memLevel = 8 (default values) + plus a few kilobytes for small objects. 
For example, if you want to reduce + the default memory requirements from 256K to 128K, compile with + make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7" + Of course this will generally degrade compression (there's no free lunch). + + The memory requirements for inflate are (in bytes) 1 << windowBits + that is, 32K for windowBits=15 (default value) plus a few kilobytes + for small objects. +*/ + + /* Type declarations */ + +#ifndef OF /* function prototypes */ +# ifdef STDC +# define OF(args) args +# else +# define OF(args) () +# endif +#endif + +#ifndef Z_ARG /* function prototypes for stdarg */ +# if defined(STDC) || defined(Z_HAVE_STDARG_H) +# define Z_ARG(args) args +# else +# define Z_ARG(args) () +# endif +#endif + +/* The following definitions for FAR are needed only for MSDOS mixed + * model programming (small or medium model with some far allocations). + * This was tested only with MSC; for other MSDOS compilers you may have + * to define NO_MEMCPY in zutil.h. If you don't need the mixed model, + * just define FAR to be empty. + */ +#ifdef SYS16BIT +# if defined(M_I86SM) || defined(M_I86MM) + /* MSC small or medium model */ +# define SMALL_MEDIUM +# ifdef _MSC_VER +# define FAR _far +# else +# define FAR far +# endif +# endif +# if (defined(__SMALL__) || defined(__MEDIUM__)) + /* Turbo C small or medium model */ +# define SMALL_MEDIUM +# ifdef __BORLANDC__ +# define FAR _far +# else +# define FAR far +# endif +# endif +#endif + +#if defined(WINDOWS) || defined(WIN32) + /* If building or using zlib as a DLL, define ZLIB_DLL. + * This is not mandatory, but it offers a little performance increase. + */ +# ifdef ZLIB_DLL +# if defined(WIN32) && (!defined(__BORLANDC__) || (__BORLANDC__ >= 0x500)) +# ifdef ZLIB_INTERNAL +# define ZEXTERN extern __declspec(dllexport) +# else +# define ZEXTERN extern __declspec(dllimport) +# endif +# endif +# endif /* ZLIB_DLL */ + /* If building or using zlib with the WINAPI/WINAPIV calling convention, + * define ZLIB_WINAPI. 
+ * Caution: the standard ZLIB1.DLL is NOT compiled using ZLIB_WINAPI. + */ +# ifdef ZLIB_WINAPI +# ifdef FAR +# undef FAR +# endif +# include <windows.h> + /* No need for _export, use ZLIB.DEF instead. */ + /* For complete Windows compatibility, use WINAPI, not __stdcall. */ +# define ZEXPORT WINAPI +# ifdef WIN32 +# define ZEXPORTVA WINAPIV +# else +# define ZEXPORTVA FAR CDECL +# endif +# endif +#endif + +#if defined (__BEOS__) +# ifdef ZLIB_DLL +# ifdef ZLIB_INTERNAL +# define ZEXPORT __declspec(dllexport) +# define ZEXPORTVA __declspec(dllexport) +# else +# define ZEXPORT __declspec(dllimport) +# define ZEXPORTVA __declspec(dllimport) +# endif +# endif +#endif + +#ifndef ZEXTERN +# define ZEXTERN extern +#endif +#ifndef ZEXPORT +# define ZEXPORT +#endif +#ifndef ZEXPORTVA +# define ZEXPORTVA +#endif + +#ifndef FAR +# define FAR +#endif + +#if !defined(__MACTYPES__) +typedef unsigned char Byte; /* 8 bits */ +#endif +typedef unsigned int uInt; /* 16 bits or more */ +typedef unsigned long uLong; /* 32 bits or more */ + +#ifdef SMALL_MEDIUM + /* Borland C/C++ and some old MSC versions ignore FAR inside typedef */ +# define Bytef Byte FAR +#else + typedef Byte FAR Bytef; +#endif +typedef char FAR charf; +typedef int FAR intf; +typedef uInt FAR uIntf; +typedef uLong FAR uLongf; + +#ifdef STDC + typedef void const *voidpc; + typedef void FAR *voidpf; + typedef void *voidp; +#else + typedef Byte const *voidpc; + typedef Byte FAR *voidpf; + typedef Byte *voidp; +#endif + +#if !defined(Z_U4) && !defined(Z_SOLO) && defined(STDC) +# include <limits.h> +# if (UINT_MAX == 0xffffffffUL) +# define Z_U4 unsigned +# elif (ULONG_MAX == 0xffffffffUL) +# define Z_U4 unsigned long +# elif (USHRT_MAX == 0xffffffffUL) +# define Z_U4 unsigned short +# endif +#endif + +#ifdef Z_U4 + typedef Z_U4 z_crc_t; +#else + typedef unsigned long z_crc_t; +#endif + +#if 1 /* was set to #if 1 by ./configure */ +# define Z_HAVE_UNISTD_H +#endif + +#if 1 /* was set to #if 1 by ./configure */ +# define Z_HAVE_STDARG_H 
+#endif + +#ifdef STDC +# ifndef Z_SOLO +# include <sys/types.h> /* for off_t */ +# endif +#endif + +#if defined(STDC) || defined(Z_HAVE_STDARG_H) +# ifndef Z_SOLO +# include <stdarg.h> /* for va_list */ +# endif +#endif + +#ifdef _WIN32 +# ifndef Z_SOLO +# include <stddef.h> /* for wchar_t */ +# endif +#endif + +/* a little trick to accommodate both "#define _LARGEFILE64_SOURCE" and + * "#define _LARGEFILE64_SOURCE 1" as requesting 64-bit operations, (even + * though the former does not conform to the LFS document), but considering + * both "#undef _LARGEFILE64_SOURCE" and "#define _LARGEFILE64_SOURCE 0" as + * equivalently requesting no 64-bit operations + */ +#if defined(_LARGEFILE64_SOURCE) && -_LARGEFILE64_SOURCE - -1 == 1 +# undef _LARGEFILE64_SOURCE +#endif + +#if defined(__WATCOMC__) && !defined(Z_HAVE_UNISTD_H) +# define Z_HAVE_UNISTD_H +#endif +#ifndef Z_SOLO +# if defined(Z_HAVE_UNISTD_H) || defined(_LARGEFILE64_SOURCE) +# include <unistd.h> /* for SEEK_*, off_t, and _LFS64_LARGEFILE */ +# ifdef VMS +# include <unixio.h> /* for off_t */ +# endif +# ifndef z_off_t +# define z_off_t off_t +# endif +# endif +#endif + +#if defined(_LFS64_LARGEFILE) && _LFS64_LARGEFILE-0 +# define Z_LFS64 +#endif + +#if defined(_LARGEFILE64_SOURCE) && defined(Z_LFS64) +# define Z_LARGE64 +#endif + +#if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS-0 == 64 && defined(Z_LFS64) +# define Z_WANT64 +#endif + +#if !defined(SEEK_SET) && !defined(Z_SOLO) +# define SEEK_SET 0 /* Seek from beginning of file. */ +# define SEEK_CUR 1 /* Seek from current position. 
*/ +# define SEEK_END 2 /* Set file pointer to EOF plus "offset" */ +#endif + +#ifndef z_off_t +# define z_off_t long +#endif + +#if !defined(_WIN32) && defined(Z_LARGE64) +# define z_off64_t off64_t +#else +# if defined(_WIN32) && !defined(__GNUC__) && !defined(Z_SOLO) +# define z_off64_t __int64 +# else +# define z_off64_t z_off_t +# endif +#endif + +/* MVS linker does not support external names larger than 8 bytes */ +#if defined(__MVS__) + #pragma map(deflateInit_,"DEIN") + #pragma map(deflateInit2_,"DEIN2") + #pragma map(deflateEnd,"DEEND") + #pragma map(deflateBound,"DEBND") + #pragma map(inflateInit_,"ININ") + #pragma map(inflateInit2_,"ININ2") + #pragma map(inflateEnd,"INEND") + #pragma map(inflateSync,"INSY") + #pragma map(inflateSetDictionary,"INSEDI") + #pragma map(compressBound,"CMBND") + #pragma map(inflate_table,"INTABL") + #pragma map(inflate_fast,"INFA") + #pragma map(inflate_copyright,"INCOPY") +#endif + +#endif /* ZCONF_H */ diff --git a/src/inc/zlib/zlib.h b/src/inc/zlib/zlib.h new file mode 100644 index 00000000..3e0c7672 --- /dev/null +++ b/src/inc/zlib/zlib.h @@ -0,0 +1,1768 @@ +/* zlib.h -- interface of the 'zlib' general purpose compression library + version 1.2.8, April 28th, 2013 + + Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. 
Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + Jean-loup Gailly Mark Adler + jloup@gzip.org madler@alumni.caltech.edu + + + The data format used by the zlib library is described by RFCs (Request for + Comments) 1950 to 1952 in the files http://tools.ietf.org/html/rfc1950 + (zlib format), rfc1951 (deflate format) and rfc1952 (gzip format). +*/ + +#ifndef ZLIB_H +#define ZLIB_H + +#include "zconf.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZLIB_VERSION "1.2.8" +#define ZLIB_VERNUM 0x1280 +#define ZLIB_VER_MAJOR 1 +#define ZLIB_VER_MINOR 2 +#define ZLIB_VER_REVISION 8 +#define ZLIB_VER_SUBREVISION 0 + +/* + The 'zlib' compression library provides in-memory compression and + decompression functions, including integrity checks of the uncompressed data. + This version of the library supports only one compression method (deflation) + but other algorithms will be added later and will have the same stream + interface. + + Compression can be done in a single step if the buffers are large enough, + or can be done by repeated calls of the compression function. In the latter + case, the application must provide more input and/or consume the output + (providing more output space) before each call. + + The compressed data format used by default by the in-memory functions is + the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped + around a deflate stream, which is itself documented in RFC 1951. + + The library also supports reading and writing files in gzip (.gz) format + with an interface similar to that of stdio using the functions that start + with "gz". The gzip format is different from the zlib format. gzip is a + gzip wrapper, documented in RFC 1952, wrapped around a deflate stream. + + This library can optionally read and write gzip streams in memory as well. 
+ + The zlib format was designed to be compact and fast for use in memory + and on communications channels. The gzip format was designed for single- + file compression on file systems, has a larger header than zlib to maintain + directory information, and uses a different, slower check method than zlib. + + The library does not install any signal handler. The decoder checks + the consistency of the compressed data, so the library should never crash + even in case of corrupted input. +*/ + +typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size)); +typedef void (*free_func) OF((voidpf opaque, voidpf address)); + +struct internal_state; + +typedef struct z_stream_s { + z_const Bytef *next_in; /* next input byte */ + uInt avail_in; /* number of bytes available at next_in */ + uLong total_in; /* total number of input bytes read so far */ + + Bytef *next_out; /* next output byte should be put there */ + uInt avail_out; /* remaining free space at next_out */ + uLong total_out; /* total number of bytes output so far */ + + z_const char *msg; /* last error message, NULL if no error */ + struct internal_state FAR *state; /* not visible by applications */ + + alloc_func zalloc; /* used to allocate the internal state */ + free_func zfree; /* used to free the internal state */ + voidpf opaque; /* private data object passed to zalloc and zfree */ + + int data_type; /* best guess about the data type: binary or text */ + uLong adler; /* adler32 value of the uncompressed data */ + uLong reserved; /* reserved for future use */ +} z_stream; + +typedef z_stream FAR *z_streamp; + +/* + gzip header information passed to and from zlib routines. See RFC 1952 + for more details on the meanings of these fields. 
+*/ +typedef struct gz_header_s { + int text; /* true if compressed data believed to be text */ + uLong time; /* modification time */ + int xflags; /* extra flags (not used when writing a gzip file) */ + int os; /* operating system */ + Bytef *extra; /* pointer to extra field or Z_NULL if none */ + uInt extra_len; /* extra field length (valid if extra != Z_NULL) */ + uInt extra_max; /* space at extra (only when reading header) */ + Bytef *name; /* pointer to zero-terminated file name or Z_NULL */ + uInt name_max; /* space at name (only when reading header) */ + Bytef *comment; /* pointer to zero-terminated comment or Z_NULL */ + uInt comm_max; /* space at comment (only when reading header) */ + int hcrc; /* true if there was or will be a header crc */ + int done; /* true when done reading gzip header (not used + when writing a gzip file) */ +} gz_header; + +typedef gz_header FAR *gz_headerp; + +/* + The application must update next_in and avail_in when avail_in has dropped + to zero. It must update next_out and avail_out when avail_out has dropped + to zero. The application must initialize zalloc, zfree and opaque before + calling the init function. All other fields are set by the compression + library and must not be updated by the application. + + The opaque value provided by the application will be passed as the first + parameter for calls of zalloc and zfree. This can be useful for custom + memory management. The compression library attaches no meaning to the + opaque value. + + zalloc must return Z_NULL if there is not enough memory for the object. + If zlib is used in a multi-threaded application, zalloc and zfree must be + thread safe. + + On 16-bit systems, the functions zalloc and zfree must be able to allocate + exactly 65536 bytes, but will not be required to allocate more than this if + the symbol MAXSEG_64K is defined (see zconf.h). 
WARNING: On MSDOS, pointers + returned by zalloc for objects of exactly 65536 bytes *must* have their + offset normalized to zero. The default allocation function provided by this + library ensures this (see zutil.c). To reduce memory requirements and avoid + any allocation of 64K objects, at the expense of compression ratio, compile + the library with -DMAX_WBITS=14 (see zconf.h). + + The fields total_in and total_out can be used for statistics or progress + reports. After compression, total_in holds the total size of the + uncompressed data and may be saved for use in the decompressor (particularly + if the decompressor wants to decompress everything in a single step). +*/ + + /* constants */ + +#define Z_NO_FLUSH 0 +#define Z_PARTIAL_FLUSH 1 +#define Z_SYNC_FLUSH 2 +#define Z_FULL_FLUSH 3 +#define Z_FINISH 4 +#define Z_BLOCK 5 +#define Z_TREES 6 +/* Allowed flush values; see deflate() and inflate() below for details */ + +#define Z_OK 0 +#define Z_STREAM_END 1 +#define Z_NEED_DICT 2 +#define Z_ERRNO (-1) +#define Z_STREAM_ERROR (-2) +#define Z_DATA_ERROR (-3) +#define Z_MEM_ERROR (-4) +#define Z_BUF_ERROR (-5) +#define Z_VERSION_ERROR (-6) +/* Return codes for the compression/decompression functions. Negative values + * are errors, positive values are used for special but normal events. 
+ */ + +#define Z_NO_COMPRESSION 0 +#define Z_BEST_SPEED 1 +#define Z_BEST_COMPRESSION 9 +#define Z_DEFAULT_COMPRESSION (-1) +/* compression levels */ + +#define Z_FILTERED 1 +#define Z_HUFFMAN_ONLY 2 +#define Z_RLE 3 +#define Z_FIXED 4 +#define Z_DEFAULT_STRATEGY 0 +/* compression strategy; see deflateInit2() below for details */ + +#define Z_BINARY 0 +#define Z_TEXT 1 +#define Z_ASCII Z_TEXT /* for compatibility with 1.2.2 and earlier */ +#define Z_UNKNOWN 2 +/* Possible values of the data_type field (though see inflate()) */ + +#define Z_DEFLATED 8 +/* The deflate compression method (the only one supported in this version) */ + +#define Z_NULL 0 /* for initializing zalloc, zfree, opaque */ + +#define zlib_version zlibVersion() +/* for compatibility with versions < 1.0.2 */ + + + /* basic functions */ + +ZEXTERN const char * ZEXPORT zlibVersion OF((void)); +/* The application can compare zlibVersion and ZLIB_VERSION for consistency. + If the first character differs, the library code actually used is not + compatible with the zlib.h header file used by the application. This check + is automatically made by deflateInit and inflateInit. + */ + +/* +ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level)); + + Initializes the internal stream state for compression. The fields + zalloc, zfree and opaque must be initialized before by the caller. If + zalloc and zfree are set to Z_NULL, deflateInit updates them to use default + allocation functions. + + The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9: + 1 gives best speed, 9 gives best compression, 0 gives no compression at all + (the input data is simply copied a block at a time). Z_DEFAULT_COMPRESSION + requests a default compromise between speed and compression (currently + equivalent to level 6). 
+ + deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_STREAM_ERROR if level is not a valid compression level, or + Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible + with the version assumed by the caller (ZLIB_VERSION). msg is set to null + if there is no error message. deflateInit does not perform any compression: + this will be done by deflate(). +*/ + + +ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush)); +/* + deflate compresses as much data as possible, and stops when the input + buffer becomes empty or the output buffer becomes full. It may introduce + some output latency (reading input without producing any output) except when + forced to flush. + + The detailed semantics are as follows. deflate performs one or both of the + following actions: + + - Compress more input starting at next_in and update next_in and avail_in + accordingly. If not all input can be processed (because there is not + enough room in the output buffer), next_in and avail_in are updated and + processing will resume at this point for the next call of deflate(). + + - Provide more output starting at next_out and update next_out and avail_out + accordingly. This action is forced if the parameter flush is non zero. + Forcing flush frequently degrades the compression ratio, so this parameter + should be set only when necessary (in interactive applications). Some + output may be provided even if flush is not set. + + Before the call of deflate(), the application should ensure that at least + one of the actions is possible, by providing more input and/or consuming more + output, and updating avail_in or avail_out accordingly; avail_out should + never be zero before the call. The application can consume the compressed + output when it wants, for example when the output buffer is full (avail_out + == 0), or after each call of deflate(). 
If deflate returns Z_OK and with + zero avail_out, it must be called again after making room in the output + buffer because there might be more output pending. + + Normally the parameter flush is set to Z_NO_FLUSH, which allows deflate to + decide how much data to accumulate before producing output, in order to + maximize compression. + + If the parameter flush is set to Z_SYNC_FLUSH, all pending output is + flushed to the output buffer and the output is aligned on a byte boundary, so + that the decompressor can get all input data available so far. (In + particular avail_in is zero after the call if enough output space has been + provided before the call.) Flushing may degrade compression for some + compression algorithms and so it should be used only when necessary. This + completes the current deflate block and follows it with an empty stored block + that is three bits plus filler bits to the next byte, followed by four bytes + (00 00 ff ff). + + If flush is set to Z_PARTIAL_FLUSH, all pending output is flushed to the + output buffer, but the output is not aligned to a byte boundary. All of the + input data so far will be available to the decompressor, as for Z_SYNC_FLUSH. + This completes the current deflate block and follows it with an empty fixed + codes block that is 10 bits long. This assures that enough bytes are output + in order for the decompressor to finish the block before the empty fixed code + block. + + If flush is set to Z_BLOCK, a deflate block is completed and emitted, as + for Z_SYNC_FLUSH, but the output is not aligned on a byte boundary, and up to + seven bits of the current block are held to be written as the next byte after + the next deflate block is completed. In this case, the decompressor may not + be provided enough bits at this point in order to complete decompression of + the data provided so far to the compressor. It may need to wait for the next + block to be emitted. 
This is for advanced applications that need to control + the emission of deflate blocks. + + If flush is set to Z_FULL_FLUSH, all output is flushed as with + Z_SYNC_FLUSH, and the compression state is reset so that decompression can + restart from this point if previous compressed data has been damaged or if + random access is desired. Using Z_FULL_FLUSH too often can seriously degrade + compression. + + If deflate returns with avail_out == 0, this function must be called again + with the same value of the flush parameter and more output space (updated + avail_out), until the flush is complete (deflate returns with non-zero + avail_out). In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that + avail_out is greater than six to avoid repeated flush markers due to + avail_out == 0 on return. + + If the parameter flush is set to Z_FINISH, pending input is processed, + pending output is flushed and deflate returns with Z_STREAM_END if there was + enough output space; if deflate returns with Z_OK, this function must be + called again with Z_FINISH and more output space (updated avail_out) but no + more input data, until it returns with Z_STREAM_END or an error. After + deflate has returned Z_STREAM_END, the only possible operations on the stream + are deflateReset or deflateEnd. + + Z_FINISH can be used immediately after deflateInit if all the compression + is to be done in a single step. In this case, avail_out must be at least the + value returned by deflateBound (see below). Then deflate is guaranteed to + return Z_STREAM_END. If not enough output space is provided, deflate will + not return Z_STREAM_END, and it must be called again as described above. + + deflate() sets strm->adler to the adler32 checksum of all input read + so far (that is, total_in bytes). + + deflate() may update strm->data_type if it can make a good guess about + the input data type (Z_BINARY or Z_TEXT). In doubt, the data is considered + binary. 
This field is only for information purposes and does not affect the + compression algorithm in any manner. + + deflate() returns Z_OK if some progress has been made (more input + processed or more output produced), Z_STREAM_END if all input has been + consumed and all output has been produced (only when flush is set to + Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example + if next_in or next_out was Z_NULL), Z_BUF_ERROR if no progress is possible + (for example avail_in or avail_out was zero). Note that Z_BUF_ERROR is not + fatal, and deflate() can be called again with more input and more output + space to continue compressing. +*/ + + +ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm)); +/* + All dynamically allocated data structures for this stream are freed. + This function discards any unprocessed input and does not flush any pending + output. + + deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the + stream state was inconsistent, Z_DATA_ERROR if the stream was freed + prematurely (some input or output was discarded). In the error case, msg + may be set but then points to a static string (which must not be + deallocated). +*/ + + +/* +ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm)); + + Initializes the internal stream state for decompression. The fields + next_in, avail_in, zalloc, zfree and opaque must be initialized before by + the caller. If next_in is not Z_NULL and avail_in is large enough (the + exact value depends on the compression method), inflateInit determines the + compression method from the zlib header and allocates all data structures + accordingly; otherwise the allocation will be deferred to the first call of + inflate. If zalloc and zfree are set to Z_NULL, inflateInit updates them to + use default allocation functions. 
+ + inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_VERSION_ERROR if the zlib library version is incompatible with the + version assumed by the caller, or Z_STREAM_ERROR if the parameters are + invalid, such as a null pointer to the structure. msg is set to null if + there is no error message. inflateInit does not perform any decompression + apart from possibly reading the zlib header if present: actual decompression + will be done by inflate(). (So next_in and avail_in may be modified, but + next_out and avail_out are unused and unchanged.) The current implementation + of inflateInit() does not process any header information -- that is deferred + until inflate() is called. +*/ + + +ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush)); +/* + inflate decompresses as much data as possible, and stops when the input + buffer becomes empty or the output buffer becomes full. It may introduce + some output latency (reading input without producing any output) except when + forced to flush. + + The detailed semantics are as follows. inflate performs one or both of the + following actions: + + - Decompress more input starting at next_in and update next_in and avail_in + accordingly. If not all input can be processed (because there is not + enough room in the output buffer), next_in is updated and processing will + resume at this point for the next call of inflate(). + + - Provide more output starting at next_out and update next_out and avail_out + accordingly. inflate() provides as much output as possible, until there is + no more input data or no more space in the output buffer (see below about + the flush parameter). + + Before the call of inflate(), the application should ensure that at least + one of the actions is possible, by providing more input and/or consuming more + output, and updating the next_* and avail_* values accordingly. 
The + application can consume the uncompressed output when it wants, for example + when the output buffer is full (avail_out == 0), or after each call of + inflate(). If inflate returns Z_OK and with zero avail_out, it must be + called again after making room in the output buffer because there might be + more output pending. + + The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH, Z_FINISH, + Z_BLOCK, or Z_TREES. Z_SYNC_FLUSH requests that inflate() flush as much + output as possible to the output buffer. Z_BLOCK requests that inflate() + stop if and when it gets to the next deflate block boundary. When decoding + the zlib or gzip format, this will cause inflate() to return immediately + after the header and before the first block. When doing a raw inflate, + inflate() will go ahead and process the first block, and will return when it + gets to the end of that block, or when it runs out of data. + + The Z_BLOCK option assists in appending to or combining deflate streams. + Also to assist in this, on return inflate() will set strm->data_type to the + number of unused bits in the last byte taken from strm->next_in, plus 64 if + inflate() is currently decoding the last block in the deflate stream, plus + 128 if inflate() returned immediately after decoding an end-of-block code or + decoding the complete header up to just before the first byte of the deflate + stream. The end-of-block will not be indicated until all of the uncompressed + data from that block has been written to strm->next_out. The number of + unused bits may in general be greater than seven, except when bit 7 of + data_type is set, in which case the number of unused bits will be less than + eight. data_type is set as noted here every time inflate() returns for all + flush options, and so can be used to determine the amount of currently + consumed input in bits. 
+ + The Z_TREES option behaves as Z_BLOCK does, but it also returns when the + end of each deflate block header is reached, before any actual data in that + block is decoded. This allows the caller to determine the length of the + deflate block header for later use in random access within a deflate block. + 256 is added to the value of strm->data_type when inflate() returns + immediately after reaching the end of the deflate block header. + + inflate() should normally be called until it returns Z_STREAM_END or an + error. However if all decompression is to be performed in a single step (a + single call of inflate), the parameter flush should be set to Z_FINISH. In + this case all pending input is processed and all pending output is flushed; + avail_out must be large enough to hold all of the uncompressed data for the + operation to complete. (The size of the uncompressed data may have been + saved by the compressor for this purpose.) The use of Z_FINISH is not + required to perform an inflation in one step. However it may be used to + inform inflate that a faster approach can be used for the single inflate() + call. Z_FINISH also informs inflate to not maintain a sliding window if the + stream completes, which reduces inflate's memory footprint. If the stream + does not complete, either because not all of the stream is provided or not + enough output space is provided, then a sliding window will be allocated and + inflate() can be called again to continue the operation as if Z_NO_FLUSH had + been used. + + In this implementation, inflate() always flushes as much output as + possible to the output buffer, and always uses the faster approach on the + first call. So the effects of the flush parameter in this implementation are + on the return value of inflate() as noted below, when inflate() returns early + when Z_BLOCK or Z_TREES is used, and when inflate() avoids the allocation of + memory for a sliding window when Z_FINISH is used. 
+ + If a preset dictionary is needed after this call (see inflateSetDictionary + below), inflate sets strm->adler to the Adler-32 checksum of the dictionary + chosen by the compressor and returns Z_NEED_DICT; otherwise it sets + strm->adler to the Adler-32 checksum of all output produced so far (that is, + total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described + below. At the end of the stream, inflate() checks that its computed adler32 + checksum is equal to that saved by the compressor and returns Z_STREAM_END + only if the checksum is correct. + + inflate() can decompress and check either zlib-wrapped or gzip-wrapped + deflate data. The header type is detected automatically, if requested when + initializing with inflateInit2(). Any information contained in the gzip + header is not retained, so applications that need that information should + instead use raw inflate, see inflateInit2() below, or inflateBack() and + perform their own processing of the gzip header and trailer. When processing + gzip-wrapped deflate data, strm->adler is set to the CRC-32 of the output + produced so far. The CRC-32 is checked against the gzip trailer. + + inflate() returns Z_OK if some progress has been made (more input processed + or more output produced), Z_STREAM_END if the end of the compressed data has + been reached and all uncompressed output has been produced, Z_NEED_DICT if a + preset dictionary is needed at this point, Z_DATA_ERROR if the input data was + corrupted (input stream not conforming to the zlib format or incorrect check + value), Z_STREAM_ERROR if the stream structure was inconsistent (for example + next_in or next_out was Z_NULL), Z_MEM_ERROR if there was not enough memory, + Z_BUF_ERROR if no progress is possible or if there was not enough room in the + output buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and + inflate() can be called again with more input and more output space to + continue decompressing. 
If Z_DATA_ERROR is returned, the application may + then call inflateSync() to look for a good compression block if a partial + recovery of the data is desired. +*/ + + +ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm)); +/* + All dynamically allocated data structures for this stream are freed. + This function discards any unprocessed input and does not flush any pending + output. + + inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state + was inconsistent. In the error case, msg may be set but then points to a + static string (which must not be deallocated). +*/ + + + /* Advanced functions */ + +/* + The following functions are needed only in some special applications. +*/ + +/* +ZEXTERN int ZEXPORT deflateInit2 OF((z_streamp strm, + int level, + int method, + int windowBits, + int memLevel, + int strategy)); + + This is another version of deflateInit with more compression options. The + fields next_in, zalloc, zfree and opaque must be initialized before by the + caller. + + The method parameter is the compression method. It must be Z_DEFLATED in + this version of the library. + + The windowBits parameter is the base two logarithm of the window size + (the size of the history buffer). It should be in the range 8..15 for this + version of the library. Larger values of this parameter result in better + compression at the expense of memory usage. The default value is 15 if + deflateInit is used instead. + + windowBits can also be -8..-15 for raw deflate. In this case, -windowBits + determines the window size. deflate() will then generate raw deflate data + with no zlib header or trailer, and will not compute an adler32 check value. + + windowBits can also be greater than 15 for optional gzip encoding. Add + 16 to windowBits to write a simple gzip header and trailer around the + compressed data instead of a zlib wrapper. 
The gzip header will have no + file name, no extra data, no comment, no modification time (set to zero), no + header crc, and the operating system will be set to 255 (unknown). If a + gzip stream is being written, strm->adler is a crc32 instead of an adler32. + + The memLevel parameter specifies how much memory should be allocated + for the internal compression state. memLevel=1 uses minimum memory but is + slow and reduces compression ratio; memLevel=9 uses maximum memory for + optimal speed. The default value is 8. See zconf.h for total memory usage + as a function of windowBits and memLevel. + + The strategy parameter is used to tune the compression algorithm. Use the + value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a + filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only (no + string match), or Z_RLE to limit match distances to one (run-length + encoding). Filtered data consists mostly of small values with a somewhat + random distribution. In this case, the compression algorithm is tuned to + compress them better. The effect of Z_FILTERED is to force more Huffman + coding and less string matching; it is somewhat intermediate between + Z_DEFAULT_STRATEGY and Z_HUFFMAN_ONLY. Z_RLE is designed to be almost as + fast as Z_HUFFMAN_ONLY, but give better compression for PNG image data. The + strategy parameter only affects the compression ratio but not the + correctness of the compressed output even if it is not set appropriately. + Z_FIXED prevents the use of dynamic Huffman codes, allowing for a simpler + decoder for special applications. + + deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_STREAM_ERROR if any parameter is invalid (such as an invalid + method), or Z_VERSION_ERROR if the zlib library version (zlib_version) is + incompatible with the version assumed by the caller (ZLIB_VERSION). msg is + set to null if there is no error message. 
deflateInit2 does not perform any + compression: this will be done by deflate(). +*/ + +ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm, + const Bytef *dictionary, + uInt dictLength)); +/* + Initializes the compression dictionary from the given byte sequence + without producing any compressed output. When using the zlib format, this + function must be called immediately after deflateInit, deflateInit2 or + deflateReset, and before any call of deflate. When doing raw deflate, this + function must be called either before any call of deflate, or immediately + after the completion of a deflate block, i.e. after all input has been + consumed and all output has been delivered when using any of the flush + options Z_BLOCK, Z_PARTIAL_FLUSH, Z_SYNC_FLUSH, or Z_FULL_FLUSH. The + compressor and decompressor must use exactly the same dictionary (see + inflateSetDictionary). + + The dictionary should consist of strings (byte sequences) that are likely + to be encountered later in the data to be compressed, with the most commonly + used strings preferably put towards the end of the dictionary. Using a + dictionary is most useful when the data to be compressed is short and can be + predicted with good accuracy; the data can then be compressed better than + with the default empty dictionary. + + Depending on the size of the compression data structures selected by + deflateInit or deflateInit2, a part of the dictionary may in effect be + discarded, for example if the dictionary is larger than the window size + provided in deflateInit or deflateInit2. Thus the strings most likely to be + useful should be put at the end of the dictionary, not at the front. In + addition, the current implementation of deflate will use at most the window + size minus 262 bytes of the provided dictionary. 
+ + Upon return of this function, strm->adler is set to the adler32 value + of the dictionary; the decompressor may later use this value to determine + which dictionary has been used by the compressor. (The adler32 value + applies to the whole dictionary even if only a subset of the dictionary is + actually used by the compressor.) If a raw deflate was requested, then the + adler32 value is not computed and strm->adler is not set. + + deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a + parameter is invalid (e.g. dictionary being Z_NULL) or the stream state is + inconsistent (for example if deflate has already been called for this stream + or if not at a block boundary for raw deflate). deflateSetDictionary does + not perform any compression: this will be done by deflate(). +*/ + +ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest, + z_streamp source)); +/* + Sets the destination stream as a complete copy of the source stream. + + This function can be useful when several compression strategies will be + tried, for example when there are several ways of pre-processing the input + data with a filter. The streams that will be discarded should then be freed + by calling deflateEnd. Note that deflateCopy duplicates the internal + compression state which can be quite large, so this strategy is slow and can + consume lots of memory. + + deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if the source stream state was inconsistent + (such as zalloc being Z_NULL). msg is left unchanged in both source and + destination. +*/ + +ZEXTERN int ZEXPORT deflateReset OF((z_streamp strm)); +/* + This function is equivalent to deflateEnd followed by deflateInit, + but does not free and reallocate all the internal compression state. The + stream will keep the same compression level and any other attributes that + may have been set by deflateInit2. 
+ + deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being Z_NULL). +*/ + +ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm, + int level, + int strategy)); +/* + Dynamically update the compression level and compression strategy. The + interpretation of level and strategy is as in deflateInit2. This can be + used to switch between compression and straight copy of the input data, or + to switch to a different kind of input data requiring a different strategy. + If the compression level is changed, the input available so far is + compressed with the old level (and may be flushed); the new level will take + effect only at the next call of deflate(). + + Before the call of deflateParams, the stream state must be set as for + a call of deflate(), since the currently available input may have to be + compressed and flushed. In particular, strm->avail_out must be non-zero. + + deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source + stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR if + strm->avail_out was zero. +*/ + +ZEXTERN int ZEXPORT deflateTune OF((z_streamp strm, + int good_length, + int max_lazy, + int nice_length, + int max_chain)); +/* + Fine tune deflate's internal compression parameters. This should only be + used by someone who understands the algorithm used by zlib's deflate for + searching for the best matching string, and even then only by the most + fanatic optimizer trying to squeeze out the last compressed bit for their + specific input data. Read the deflate.c source code for the meaning of the + max_lazy, good_length, nice_length, and max_chain parameters. + + deflateTune() can be called after deflateInit() or deflateInit2(), and + returns Z_OK on success, or Z_STREAM_ERROR for an invalid deflate stream. 
+ */ + +ZEXTERN uLong ZEXPORT deflateBound OF((z_streamp strm, + uLong sourceLen)); +/* + deflateBound() returns an upper bound on the compressed size after + deflation of sourceLen bytes. It must be called after deflateInit() or + deflateInit2(), and after deflateSetHeader(), if used. This would be used + to allocate an output buffer for deflation in a single pass, and so would be + called before deflate(). If that first deflate() call is provided the + sourceLen input bytes, an output buffer allocated to the size returned by + deflateBound(), and the flush value Z_FINISH, then deflate() is guaranteed + to return Z_STREAM_END. Note that it is possible for the compressed size to + be larger than the value returned by deflateBound() if flush options other + than Z_FINISH or Z_NO_FLUSH are used. +*/ + +ZEXTERN int ZEXPORT deflatePending OF((z_streamp strm, + unsigned *pending, + int *bits)); +/* + deflatePending() returns the number of bytes and bits of output that have + been generated, but not yet provided in the available output. The bytes not + provided would be due to the available output space having been consumed. + The number of bits of output not provided is between 0 and 7, where they + await more bits to join them in order to fill out a full byte. If pending + or bits are Z_NULL, then those values are not set. + + deflatePending returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. + */ + +ZEXTERN int ZEXPORT deflatePrime OF((z_streamp strm, + int bits, + int value)); +/* + deflatePrime() inserts bits in the deflate output stream. The intent + is that this function is used to start off the deflate output with the bits + leftover from a previous deflate stream when appending to it. As such, this + function can only be used for raw deflate, and must be used before the first + deflate() call after a deflateInit2() or deflateReset().
bits must be less + than or equal to 16, and that many of the least significant bits of value + will be inserted in the output. + + deflatePrime returns Z_OK if success, Z_BUF_ERROR if there was not enough + room in the internal buffer to insert the bits, or Z_STREAM_ERROR if the + source stream state was inconsistent. +*/ + +ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm, + gz_headerp head)); +/* + deflateSetHeader() provides gzip header information for when a gzip + stream is requested by deflateInit2(). deflateSetHeader() may be called + after deflateInit2() or deflateReset() and before the first call of + deflate(). The text, time, os, extra field, name, and comment information + in the provided gz_header structure are written to the gzip header (xflag is + ignored -- the extra flags are set according to the compression level). The + caller must assure that, if not Z_NULL, name and comment are terminated with + a zero byte, and that if extra is not Z_NULL, that extra_len bytes are + available there. If hcrc is true, a gzip header crc is included. Note that + the current versions of the command-line version of gzip (up through version + 1.3.x) do not support header crc's, and will report that it is a "multi-part + gzip file" and give up. + + If deflateSetHeader is not used, the default gzip header has text false, + the time set to zero, and os set to 255, with no extra, name, or comment + fields. The gzip header is returned to the default state by deflateReset(). + + deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +/* +ZEXTERN int ZEXPORT inflateInit2 OF((z_streamp strm, + int windowBits)); + + This is another version of inflateInit with an extra parameter. The + fields next_in, avail_in, zalloc, zfree and opaque must be initialized + before by the caller. + + The windowBits parameter is the base two logarithm of the maximum window + size (the size of the history buffer). 
It should be in the range 8..15 for + this version of the library. The default value is 15 if inflateInit is used + instead. windowBits must be greater than or equal to the windowBits value + provided to deflateInit2() while compressing, or it must be equal to 15 if + deflateInit2() was not used. If a compressed stream with a larger window + size is given as input, inflate() will return with the error code + Z_DATA_ERROR instead of trying to allocate a larger window. + + windowBits can also be zero to request that inflate use the window size in + the zlib header of the compressed stream. + + windowBits can also be -8..-15 for raw inflate. In this case, -windowBits + determines the window size. inflate() will then process raw deflate data, + not looking for a zlib or gzip header, not generating a check value, and not + looking for any check values for comparison at the end of the stream. This + is for use with other formats that use the deflate compressed data format + such as zip. Those formats provide their own check values. If a custom + format is developed using the raw deflate format for compressed data, it is + recommended that a check value such as an adler32 or a crc32 be applied to + the uncompressed data as is done in the zlib, gzip, and zip formats. For + most applications, the zlib format should be used as is. Note that comments + above on the use in deflateInit2() apply to the magnitude of windowBits. + + windowBits can also be greater than 15 for optional gzip decoding. Add + 32 to windowBits to enable zlib and gzip decoding with automatic header + detection, or add 16 to decode only the gzip format (the zlib format will + return a Z_DATA_ERROR). If a gzip stream is being decoded, strm->adler is a + crc32 instead of an adler32.
+ + inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_VERSION_ERROR if the zlib library version is incompatible with the + version assumed by the caller, or Z_STREAM_ERROR if the parameters are + invalid, such as a null pointer to the structure. msg is set to null if + there is no error message. inflateInit2 does not perform any decompression + apart from possibly reading the zlib header if present: actual decompression + will be done by inflate(). (So next_in and avail_in may be modified, but + next_out and avail_out are unused and unchanged.) The current implementation + of inflateInit2() does not process any header information -- that is + deferred until inflate() is called. +*/ + +ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm, + const Bytef *dictionary, + uInt dictLength)); +/* + Initializes the decompression dictionary from the given uncompressed byte + sequence. This function must be called immediately after a call of inflate, + if that call returned Z_NEED_DICT. The dictionary chosen by the compressor + can be determined from the adler32 value returned by that call of inflate. + The compressor and decompressor must use exactly the same dictionary (see + deflateSetDictionary). For raw inflate, this function can be called at any + time to set the dictionary. If the provided dictionary is smaller than the + window and there is already data in the window, then the provided dictionary + will amend what's there. The application must ensure that the dictionary + that was used for compression is provided. + + inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a + parameter is invalid (e.g. dictionary being Z_NULL) or the stream state is + inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the + expected one (incorrect adler32 value). inflateSetDictionary does not + perform any decompression: this will be done by subsequent calls of + inflate().
+*/ + +ZEXTERN int ZEXPORT inflateGetDictionary OF((z_streamp strm, + Bytef *dictionary, + uInt *dictLength)); +/* + Returns the sliding dictionary being maintained by inflate. dictLength is + set to the number of bytes in the dictionary, and that many bytes are copied + to dictionary. dictionary must have enough space, where 32768 bytes is + always enough. If inflateGetDictionary() is called with dictionary equal to + Z_NULL, then only the dictionary length is returned, and nothing is copied. + Similarly, if dictLength is Z_NULL, then it is not set. + + inflateGetDictionary returns Z_OK on success, or Z_STREAM_ERROR if the + stream state is inconsistent. +*/ + +ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm)); +/* + Skips invalid compressed data until a possible full flush point (see above + for the description of deflate with Z_FULL_FLUSH) can be found, or until all + available input is skipped. No output is provided. + + inflateSync searches for a 00 00 FF FF pattern in the compressed data. + All full flush points have this pattern, but not all occurrences of this + pattern are full flush points. + + inflateSync returns Z_OK if a possible full flush point has been found, + Z_BUF_ERROR if no more input was provided, Z_DATA_ERROR if no flush point + has been found, or Z_STREAM_ERROR if the stream structure was inconsistent. + In the success case, the application may save the current value of + total_in which indicates where valid compressed data was found. In the + error case, the application may repeatedly call inflateSync, providing more + input each time, until success or end of the input data. +*/ + +ZEXTERN int ZEXPORT inflateCopy OF((z_streamp dest, + z_streamp source)); +/* + Sets the destination stream as a complete copy of the source stream. + + This function can be useful when randomly accessing a large stream.
The + first pass through the stream can periodically record the inflate state, + allowing restarting inflate at those points when randomly accessing the + stream. + + inflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if the source stream state was inconsistent + (such as zalloc being Z_NULL). msg is left unchanged in both source and + destination. +*/ + +ZEXTERN int ZEXPORT inflateReset OF((z_streamp strm)); +/* + This function is equivalent to inflateEnd followed by inflateInit, + but does not free and reallocate all the internal decompression state. The + stream will keep attributes that may have been set by inflateInit2. + + inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being Z_NULL). +*/ + +ZEXTERN int ZEXPORT inflateReset2 OF((z_streamp strm, + int windowBits)); +/* + This function is the same as inflateReset, but it also permits changing + the wrap and window size requests. The windowBits parameter is interpreted + the same as it is for inflateInit2. + + inflateReset2 returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being Z_NULL), or if + the windowBits parameter is invalid. +*/ + +ZEXTERN int ZEXPORT inflatePrime OF((z_streamp strm, + int bits, + int value)); +/* + This function inserts bits in the inflate input stream. The intent is + that this function is used to start inflating at a bit position in the + middle of a byte. The provided bits will be used before any bytes are used + from next_in. This function should only be used with raw inflate, and + should be used before the first inflate() call after inflateInit2() or + inflateReset(). bits must be less than or equal to 16, and that many of the + least significant bits of value will be inserted in the input. + + If bits is negative, then the input stream bit buffer is emptied. 
Then + inflatePrime() can be called again to put bits in the buffer. This is used + to clear out bits leftover after feeding inflate a block description prior + to feeding inflate codes. + + inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +ZEXTERN long ZEXPORT inflateMark OF((z_streamp strm)); +/* + This function returns two values, one in the lower 16 bits of the return + value, and the other in the remaining upper bits, obtained by shifting the + return value down 16 bits. If the upper value is -1 and the lower value is + zero, then inflate() is currently decoding information outside of a block. + If the upper value is -1 and the lower value is non-zero, then inflate is in + the middle of a stored block, with the lower value equaling the number of + bytes from the input remaining to copy. If the upper value is not -1, then + it is the number of bits back from the current bit position in the input of + the code (literal or length/distance pair) currently being processed. In + that case the lower value is the number of bytes already emitted for that + code. + + A code is being processed if inflate is waiting for more input to complete + decoding of the code, or if it has completed decoding but is waiting for + more output space to write the literal or match data. + + inflateMark() is used to mark locations in the input data for random + access, which may be at bit positions, and to note those cases where the + output of a code may span boundaries of random access blocks. The current + location in the input stream can be determined from avail_in and data_type + as noted in the description for the Z_BLOCK flush parameter for inflate. + + inflateMark returns the value noted above or -1 << 16 if the provided + source stream state was inconsistent. 
+*/ + +ZEXTERN int ZEXPORT inflateGetHeader OF((z_streamp strm, + gz_headerp head)); +/* + inflateGetHeader() requests that gzip header information be stored in the + provided gz_header structure. inflateGetHeader() may be called after + inflateInit2() or inflateReset(), and before the first call of inflate(). + As inflate() processes the gzip stream, head->done is zero until the header + is completed, at which time head->done is set to one. If a zlib stream is + being decoded, then head->done is set to -1 to indicate that there will be + no gzip header information forthcoming. Note that Z_BLOCK or Z_TREES can be + used to force inflate() to return immediately after header processing is + complete and before any actual data is decompressed. + + The text, time, xflags, and os fields are filled in with the gzip header + contents. hcrc is set to true if there is a header CRC. (The header CRC + was valid if done is set to one.) If extra is not Z_NULL, then extra_max + contains the maximum number of bytes to write to extra. Once done is true, + extra_len contains the actual extra field length, and extra contains the + extra field, or that field truncated if extra_max is less than extra_len. + If name is not Z_NULL, then up to name_max characters are written there, + terminated with a zero unless the length is greater than name_max. If + comment is not Z_NULL, then up to comm_max characters are written there, + terminated with a zero unless the length is greater than comm_max. When any + of extra, name, or comment are not Z_NULL and the respective field is not + present in the header, then that field is set to Z_NULL to signal its + absence. This allows the use of deflateSetHeader() with the returned + structure to duplicate the header. However if those fields are set to + allocated memory, then the application will need to save those pointers + elsewhere so that they can be eventually freed. 
+ + If inflateGetHeader is not used, then the header information is simply + discarded. The header is always checked for validity, including the header + CRC if present. inflateReset() will reset the process to discard the header + information. The application would need to call inflateGetHeader() again to + retrieve the header from the next gzip stream. + + inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +/* +ZEXTERN int ZEXPORT inflateBackInit OF((z_streamp strm, int windowBits, + unsigned char FAR *window)); + + Initialize the internal stream state for decompression using inflateBack() + calls. The fields zalloc, zfree and opaque in strm must be initialized + before the call. If zalloc and zfree are Z_NULL, then the default library- + derived memory allocation routines are used. windowBits is the base two + logarithm of the window size, in the range 8..15. window is a caller + supplied buffer of that size. Except for special applications where it is + assured that deflate was used with small window sizes, windowBits must be 15 + and a 32K byte window must be supplied to be able to decompress general + deflate streams. + + See inflateBack() for the usage of these routines. + + inflateBackInit will return Z_OK on success, Z_STREAM_ERROR if any of + the parameters are invalid, Z_MEM_ERROR if the internal state could not be + allocated, or Z_VERSION_ERROR if the version of the library does not match + the version of the header file. +*/ + +typedef unsigned (*in_func) OF((void FAR *, + z_const unsigned char FAR * FAR *)); +typedef int (*out_func) OF((void FAR *, unsigned char FAR *, unsigned)); + +ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm, + in_func in, void FAR *in_desc, + out_func out, void FAR *out_desc)); +/* + inflateBack() does a raw inflate with a single call using a call-back + interface for input and output. 
This is potentially more efficient than + inflate() for file i/o applications, in that it avoids copying between the + output and the sliding window by simply making the window itself the output + buffer. inflate() can be faster on modern CPUs when used with large + buffers. inflateBack() trusts the application to not change the output + buffer passed by the output function, at least until inflateBack() returns. + + inflateBackInit() must be called first to allocate the internal state + and to initialize the state with the user-provided window buffer. + inflateBack() may then be used multiple times to inflate a complete, raw + deflate stream with each call. inflateBackEnd() is then called to free the + allocated state. + + A raw deflate stream is one with no zlib or gzip header or trailer. + This routine would normally be used in a utility that reads zip or gzip + files and writes out uncompressed files. The utility would decode the + header and process the trailer on its own, hence this routine expects only + the raw deflate stream to decompress. This is different from the normal + behavior of inflate(), which expects either a zlib or gzip header and + trailer around the deflate stream. + + inflateBack() uses two subroutines supplied by the caller that are then + called by inflateBack() for input and output. inflateBack() calls those + routines until it reads a complete deflate stream and writes out all of the + uncompressed data, or until it encounters an error. The function's + parameters and return types are defined above in the in_func and out_func + typedefs. inflateBack() will call in(in_desc, &buf) which should return the + number of bytes of provided input, and a pointer to that input in buf. If + there is no input available, in() must return zero--buf is ignored in that + case--and inflateBack() will return a buffer error. inflateBack() will call + out(out_desc, buf, len) to write the uncompressed data buf[0..len-1]. 
out() + should return zero on success, or non-zero on failure. If out() returns + non-zero, inflateBack() will return with an error. Neither in() nor out() + are permitted to change the contents of the window provided to + inflateBackInit(), which is also the buffer that out() uses to write from. + The length written by out() will be at most the window size. Any non-zero + amount of input may be provided by in(). + + For convenience, inflateBack() can be provided input on the first call by + setting strm->next_in and strm->avail_in. If that input is exhausted, then + in() will be called. Therefore strm->next_in must be initialized before + calling inflateBack(). If strm->next_in is Z_NULL, then in() will be called + immediately for input. If strm->next_in is not Z_NULL, then strm->avail_in + must also be initialized, and then if strm->avail_in is not zero, input will + initially be taken from strm->next_in[0 .. strm->avail_in - 1]. + + The in_desc and out_desc parameters of inflateBack() are passed as the + first parameter of in() and out() respectively when they are called. These + descriptors can be optionally used to pass any information that the caller- + supplied in() and out() functions need to do their job. + + On return, inflateBack() will set strm->next_in and strm->avail_in to + pass back any unused input that was provided by the last in() call. The + return values of inflateBack() can be Z_STREAM_END on success, Z_BUF_ERROR + if in() or out() returned an error, Z_DATA_ERROR if there was a format error + in the deflate stream (in which case strm->msg is set to indicate the nature + of the error), or Z_STREAM_ERROR if the stream was not properly initialized. + In the case of Z_BUF_ERROR, an input or output error can be distinguished + using strm->next_in which will be Z_NULL only if in() returned an error. If + strm->next_in is not Z_NULL, then the Z_BUF_ERROR was due to out() returning + non-zero.
(in() will always be called before out(), so strm->next_in is + assured to be defined if out() returns non-zero.) Note that inflateBack() + cannot return Z_OK. +*/ + +ZEXTERN int ZEXPORT inflateBackEnd OF((z_streamp strm)); +/* + All memory allocated by inflateBackInit() is freed. + + inflateBackEnd() returns Z_OK on success, or Z_STREAM_ERROR if the stream + state was inconsistent. +*/ + +ZEXTERN uLong ZEXPORT zlibCompileFlags OF((void)); +/* Return flags indicating compile-time options. + + Type sizes, two bits each, 00 = 16 bits, 01 = 32, 10 = 64, 11 = other: + 1.0: size of uInt + 3.2: size of uLong + 5.4: size of voidpf (pointer) + 7.6: size of z_off_t + + Compiler, assembler, and debug options: + 8: DEBUG + 9: ASMV or ASMINF -- use ASM code + 10: ZLIB_WINAPI -- exported functions use the WINAPI calling convention + 11: 0 (reserved) + + One-time table building (smaller code, but not thread-safe if true): + 12: BUILDFIXED -- build static block decoding tables when needed + 13: DYNAMIC_CRC_TABLE -- build CRC calculation tables when needed + 14,15: 0 (reserved) + + Library content (indicates missing functionality): + 16: NO_GZCOMPRESS -- gz* functions cannot compress (to avoid linking + deflate code when not needed) + 17: NO_GZIP -- deflate can't write gzip streams, and inflate can't detect + and decode gzip streams (to avoid linking crc code) + 18-19: 0 (reserved) + + Operation variations (changes in library functionality): + 20: PKZIP_BUG_WORKAROUND -- slightly more permissive inflate + 21: FASTEST -- deflate algorithm with only one, lowest compression level + 22,23: 0 (reserved) + + The sprintf variant used by gzprintf (zero is best): + 24: 0 = vs*, 1 = s* -- 1 means limited to 20 arguments after the format + 25: 0 = *nprintf, 1 = *printf -- 1 means gzprintf() not secure! 
+ 26: 0 = returns value, 1 = void -- 1 means inferred string length returned + + Remainder: + 27-31: 0 (reserved) + */ + +#ifndef Z_SOLO + + /* utility functions */ + +/* + The following utility functions are implemented on top of the basic + stream-oriented functions. To simplify the interface, some default options + are assumed (compression level and memory usage, standard memory allocation + functions). The source code of these utility functions can be modified if + you need special options. +*/ + +ZEXTERN int ZEXPORT compress OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen)); +/* + Compresses the source buffer into the destination buffer. sourceLen is + the byte length of the source buffer. Upon entry, destLen is the total size + of the destination buffer, which must be at least the value returned by + compressBound(sourceLen). Upon exit, destLen is the actual size of the + compressed buffer. + + compress returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_BUF_ERROR if there was not enough room in the output + buffer. +*/ + +ZEXTERN int ZEXPORT compress2 OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen, + int level)); +/* + Compresses the source buffer into the destination buffer. The level + parameter has the same meaning as in deflateInit. sourceLen is the byte + length of the source buffer. Upon entry, destLen is the total size of the + destination buffer, which must be at least the value returned by + compressBound(sourceLen). Upon exit, destLen is the actual size of the + compressed buffer. + + compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_BUF_ERROR if there was not enough room in the output buffer, + Z_STREAM_ERROR if the level parameter is invalid. +*/ + +ZEXTERN uLong ZEXPORT compressBound OF((uLong sourceLen)); +/* + compressBound() returns an upper bound on the compressed size after + compress() or compress2() on sourceLen bytes. 
It would be used before a + compress() or compress2() call to allocate the destination buffer. +*/ + +ZEXTERN int ZEXPORT uncompress OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen)); +/* + Decompresses the source buffer into the destination buffer. sourceLen is + the byte length of the source buffer. Upon entry, destLen is the total size + of the destination buffer, which must be large enough to hold the entire + uncompressed data. (The size of the uncompressed data must have been saved + previously by the compressor and transmitted to the decompressor by some + mechanism outside the scope of this compression library.) Upon exit, destLen + is the actual size of the uncompressed buffer. + + uncompress returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_BUF_ERROR if there was not enough room in the output + buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete. In + the case where there is not enough room, uncompress() will fill the output + buffer with the uncompressed data up to that point. +*/ + + /* gzip file access functions */ + +/* + This library supports reading and writing files in gzip (.gz) format with + an interface similar to that of stdio, using the functions that start with + "gz". The gzip format is different from the zlib format. gzip is a + wrapper format, documented in RFC 1952, wrapped around a deflate stream. +*/ + +typedef struct gzFile_s *gzFile; /* semi-opaque gzip file descriptor */ + +/* +ZEXTERN gzFile ZEXPORT gzopen OF((const char *path, const char *mode)); + + Opens a gzip (.gz) file for reading or writing. The mode parameter is as + in fopen ("rb" or "wb") but can also include a compression level ("wb9") or + a strategy: 'f' for filtered data as in "wb6f", 'h' for Huffman-only + compression as in "wb1h", 'R' for run-length encoding as in "wb1R", or 'F' + for fixed code compression as in "wb9F".
(See the description of + deflateInit2 for more information about the strategy parameter.) 'T' will + request transparent writing or appending with no compression and not using + the gzip format. + + "a" can be used instead of "w" to request that the gzip stream that will + be written be appended to the file. "+" will result in an error, since + reading and writing to the same gzip file is not supported. The addition of + "x" when writing will create the file exclusively, which fails if the file + already exists. On systems that support it, the addition of "e" when + reading or writing will set the flag to close the file on an execve() call. + + These functions, as well as gzip, will read and decode a sequence of gzip + streams in a file. The append function of gzopen() can be used to create + such a file. (Also see gzflush() for another way to do this.) When + appending, gzopen does not test whether the file begins with a gzip stream, + nor does it look for the end of the gzip streams to begin appending. gzopen + will simply append a gzip stream to the existing file. + + gzopen can be used to read a file which is not in gzip format; in this + case gzread will directly read from the file without decompression. When + reading, this will be detected automatically by looking for the magic two- + byte gzip header. + + gzopen returns NULL if the file could not be opened, if there was + insufficient memory to allocate the gzFile state, or if an invalid mode was + specified (an 'r', 'w', or 'a' was not provided, or '+' was provided). + errno can be checked to determine if the reason gzopen failed was that the + file could not be opened. +*/ + +ZEXTERN gzFile ZEXPORT gzdopen OF((int fd, const char *mode)); +/* + gzdopen associates a gzFile with the file descriptor fd. File descriptors + are obtained from calls like open, dup, creat, pipe or fileno (if the file + has been previously opened with fopen). The mode parameter is as in gzopen. 
+ + The next call of gzclose on the returned gzFile will also close the file + descriptor fd, just like fclose(fdopen(fd, mode)) closes the file descriptor + fd. If you want to keep fd open, use fd = dup(fd_keep); gz = gzdopen(fd, + mode);. The duplicated descriptor should be saved to avoid a leak, since + gzdopen does not close fd if it fails. If you are using fileno() to get the + file descriptor from a FILE *, then you will have to use dup() to avoid + double-close()ing the file descriptor. Both gzclose() and fclose() will + close the associated file descriptor, so they need to have different file + descriptors. + + gzdopen returns NULL if there was insufficient memory to allocate the + gzFile state, if an invalid mode was specified (an 'r', 'w', or 'a' was not + provided, or '+' was provided), or if fd is -1. The file descriptor is not + used until the next gz* read, write, seek, or close operation, so gzdopen + will not detect if fd is invalid (unless fd is -1). +*/ + +ZEXTERN int ZEXPORT gzbuffer OF((gzFile file, unsigned size)); +/* + Set the internal buffer size used by this library's functions. The + default buffer size is 8192 bytes. This function must be called after + gzopen() or gzdopen(), and before any other calls that read or write the + file. The buffer memory allocation is always deferred to the first read or + write. Two buffers are allocated, either both of the specified size when + writing, or one of the specified size and the other twice that size when + reading. A larger buffer size of, for example, 64K or 128K bytes will + noticeably increase the speed of decompression (reading). + + The new buffer size also affects the maximum length for gzprintf(). + + gzbuffer() returns 0 on success, or -1 on failure, such as being called + too late. +*/ + +ZEXTERN int ZEXPORT gzsetparams OF((gzFile file, int level, int strategy)); +/* + Dynamically update the compression level or strategy. 
See the description + of deflateInit2 for the meaning of these parameters. + + gzsetparams returns Z_OK if success, or Z_STREAM_ERROR if the file was not + opened for writing. +*/ + +ZEXTERN int ZEXPORT gzread OF((gzFile file, voidp buf, unsigned len)); +/* + Reads the given number of uncompressed bytes from the compressed file. If + the input file is not in gzip format, gzread copies the given number of + bytes into the buffer directly from the file. + + After reaching the end of a gzip stream in the input, gzread will continue + to read, looking for another gzip stream. Any number of gzip streams may be + concatenated in the input file, and will all be decompressed by gzread(). + If something other than a gzip stream is encountered after a gzip stream, + that remaining trailing garbage is ignored (and no error is returned). + + gzread can be used to read a gzip file that is being concurrently written. + Upon reaching the end of the input, gzread will return with the available + data. If the error code returned by gzerror is Z_OK or Z_BUF_ERROR, then + gzclearerr can be used to clear the end of file indicator in order to permit + gzread to be tried again. Z_OK indicates that a gzip stream was completed + on the last gzread. Z_BUF_ERROR indicates that the input file ended in the + middle of a gzip stream. Note that gzread does not return -1 in the event + of an incomplete gzip stream. This error is deferred until gzclose(), which + will return Z_BUF_ERROR if the last gzread ended in the middle of a gzip + stream. Alternatively, gzerror can be used before gzclose to detect this + case. + + gzread returns the number of uncompressed bytes actually read, less than + len for end of file, or -1 for error. +*/ + +ZEXTERN int ZEXPORT gzwrite OF((gzFile file, + voidpc buf, unsigned len)); +/* + Writes the given number of uncompressed bytes into the compressed file. + gzwrite returns the number of uncompressed bytes written or 0 in case of + error. 
+*/ + +ZEXTERN int ZEXPORTVA gzprintf Z_ARG((gzFile file, const char *format, ...)); +/* + Converts, formats, and writes the arguments to the compressed file under + control of the format string, as in fprintf. gzprintf returns the number of + uncompressed bytes actually written, or 0 in case of error. The number of + uncompressed bytes written is limited to 8191, or one less than the buffer + size given to gzbuffer(). The caller should assure that this limit is not + exceeded. If it is exceeded, then gzprintf() will return an error (0) with + nothing written. In this case, there may also be a buffer overflow with + unpredictable consequences, which is possible only if zlib was compiled with + the insecure functions sprintf() or vsprintf() because the secure snprintf() + or vsnprintf() functions were not available. This can be determined using + zlibCompileFlags(). +*/ + +ZEXTERN int ZEXPORT gzputs OF((gzFile file, const char *s)); +/* + Writes the given null-terminated string to the compressed file, excluding + the terminating null character. + + gzputs returns the number of characters written, or -1 in case of error. +*/ + +ZEXTERN char * ZEXPORT gzgets OF((gzFile file, char *buf, int len)); +/* + Reads bytes from the compressed file until len-1 characters are read, or a + newline character is read and transferred to buf, or an end-of-file + condition is encountered. If any characters are read or if len == 1, the + string is terminated with a null character. If no characters are read due + to an end-of-file or len < 1, then the buffer is left untouched. + + gzgets returns buf which is a null-terminated string, or it returns NULL + for end-of-file or in case of error. If there was an error, the contents at + buf are indeterminate. +*/ + +ZEXTERN int ZEXPORT gzputc OF((gzFile file, int c)); +/* + Writes c, converted to an unsigned char, into the compressed file. gzputc + returns the value that was written, or -1 in case of error. 
+*/ + +ZEXTERN int ZEXPORT gzgetc OF((gzFile file)); +/* + Reads one byte from the compressed file. gzgetc returns this byte or -1 + in case of end of file or error. This is implemented as a macro for speed. + As such, it does not do all of the checking the other functions do. I.e. + it does not check to see if file is NULL, nor whether the structure file + points to has been clobbered or not. +*/ + +ZEXTERN int ZEXPORT gzungetc OF((int c, gzFile file)); +/* + Push one character back onto the stream to be read as the first character + on the next read. At least one character of push-back is allowed. + gzungetc() returns the character pushed, or -1 on failure. gzungetc() will + fail if c is -1, and may fail if a character has been pushed but not read + yet. If gzungetc is used immediately after gzopen or gzdopen, at least the + output buffer size of pushed characters is allowed. (See gzbuffer above.) + The pushed character will be discarded if the stream is repositioned with + gzseek() or gzrewind(). +*/ + +ZEXTERN int ZEXPORT gzflush OF((gzFile file, int flush)); +/* + Flushes all pending output into the compressed file. The parameter flush + is as in the deflate() function. The return value is the zlib error number + (see function gzerror below). gzflush is only permitted when writing. + + If the flush parameter is Z_FINISH, the remaining data is written and the + gzip stream is completed in the output. If gzwrite() is called again, a new + gzip stream will be started in the output. gzread() is able to read such + concatenated gzip streams. + + gzflush should be called only when strictly necessary because it will + degrade compression if called too often. +*/ + +/* +ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile file, + z_off_t offset, int whence)); + + Sets the starting position for the next gzread or gzwrite on the given + compressed file. The offset represents a number of bytes in the + uncompressed data stream. 
The whence parameter is defined as in lseek(2); + the value SEEK_END is not supported. + + If the file is opened for reading, this function is emulated but can be + extremely slow. If the file is opened for writing, only forward seeks are + supported; gzseek then compresses a sequence of zeroes up to the new + starting position. + + gzseek returns the resulting offset location as measured in bytes from + the beginning of the uncompressed stream, or -1 in case of error, in + particular if the file is opened for writing and the new starting position + would be before the current position. +*/ + +ZEXTERN int ZEXPORT gzrewind OF((gzFile file)); +/* + Rewinds the given file. This function is supported only for reading. + + gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET) +*/ + +/* +ZEXTERN z_off_t ZEXPORT gztell OF((gzFile file)); + + Returns the starting position for the next gzread or gzwrite on the given + compressed file. This position represents a number of bytes in the + uncompressed data stream, and is zero when starting, even if appending or + reading a gzip stream from the middle of a file using gzdopen(). + + gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR) +*/ + +/* +ZEXTERN z_off_t ZEXPORT gzoffset OF((gzFile file)); + + Returns the current offset in the file being read or written. This offset + includes the count of bytes that precede the gzip stream, for example when + appending or when using gzdopen() for reading. When reading, the offset + does not include as yet unused buffered input. This information can be used + for a progress indicator. On error, gzoffset() returns -1. +*/ + +ZEXTERN int ZEXPORT gzeof OF((gzFile file)); +/* + Returns true (1) if the end-of-file indicator has been set while reading, + false (0) otherwise. Note that the end-of-file indicator is set only if the + read tried to go past the end of the input, but came up short. 
Therefore, + just like feof(), gzeof() may return false even if there is no more data to + read, in the event that the last read request was for the exact number of + bytes remaining in the input file. This will happen if the input file size + is an exact multiple of the buffer size. + + If gzeof() returns true, then the read functions will return no more data, + unless the end-of-file indicator is reset by gzclearerr() and the input file + has grown since the previous end of file was detected. +*/ + +ZEXTERN int ZEXPORT gzdirect OF((gzFile file)); +/* + Returns true (1) if file is being copied directly while reading, or false + (0) if file is a gzip stream being decompressed. + + If the input file is empty, gzdirect() will return true, since the input + does not contain a gzip stream. + + If gzdirect() is used immediately after gzopen() or gzdopen() it will + cause buffers to be allocated to allow reading the file to determine if it + is a gzip file. Therefore if gzbuffer() is used, it should be called before + gzdirect(). + + When writing, gzdirect() returns true (1) if transparent writing was + requested ("wT" for the gzopen() mode), or false (0) otherwise. (Note: + gzdirect() is not needed when writing. Transparent writing must be + explicitly requested, so the application already knows the answer. When + linking statically, using gzdirect() will include all of the zlib code for + gzip file reading and decompression, which may not be desired.) +*/ + +ZEXTERN int ZEXPORT gzclose OF((gzFile file)); +/* + Flushes all pending output if necessary, closes the compressed file and + deallocates the (de)compression state. Note that once file is closed, you + cannot call gzerror with file, since its structures have been deallocated. + gzclose must not be called more than once on the same file, just as free + must not be called more than once on the same allocation. 
+ + gzclose will return Z_STREAM_ERROR if file is not valid, Z_ERRNO on a + file operation error, Z_MEM_ERROR if out of memory, Z_BUF_ERROR if the + last read ended in the middle of a gzip stream, or Z_OK on success. +*/ + +ZEXTERN int ZEXPORT gzclose_r OF((gzFile file)); +ZEXTERN int ZEXPORT gzclose_w OF((gzFile file)); +/* + Same as gzclose(), but gzclose_r() is only for use when reading, and + gzclose_w() is only for use when writing or appending. The advantage to + using these instead of gzclose() is that they avoid linking in zlib + compression or decompression code that is not used when only reading or only + writing respectively. If gzclose() is used, then both compression and + decompression code will be included in the application when linking to a static + zlib library. +*/ + +ZEXTERN const char * ZEXPORT gzerror OF((gzFile file, int *errnum)); +/* + Returns the error message for the last error which occurred on the given + compressed file. errnum is set to zlib error number. If an error occurred + in the file system and not in the compression library, errnum is set to + Z_ERRNO and the application may consult errno to get the exact error code. + + The application must not modify the returned string. Future calls to + this function may invalidate the previously returned string. If file is + closed, then the string previously returned by gzerror will no longer be + available. + + gzerror() should be used to distinguish errors from end-of-file for those + functions above that do not distinguish those cases in their return values. +*/ + +ZEXTERN void ZEXPORT gzclearerr OF((gzFile file)); +/* + Clears the error and end-of-file flags for file. This is analogous to the + clearerr() function in stdio. This is useful for continuing to read a gzip + file that is being written concurrently. 
+*/ + +#endif /* !Z_SOLO */ + + /* checksum functions */ + +/* + These functions are not related to compression but are exported + anyway because they might be useful in applications using the compression + library. +*/ + +ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len)); +/* + Update a running Adler-32 checksum with the bytes buf[0..len-1] and + return the updated checksum. If buf is Z_NULL, this function returns the + required initial value for the checksum. + + An Adler-32 checksum is almost as reliable as a CRC32 but can be computed + much faster. + + Usage example: + + uLong adler = adler32(0L, Z_NULL, 0); + + while (read_buffer(buffer, length) != EOF) { + adler = adler32(adler, buffer, length); + } + if (adler != original_adler) error(); +*/ + +/* +ZEXTERN uLong ZEXPORT adler32_combine OF((uLong adler1, uLong adler2, + z_off_t len2)); + + Combine two Adler-32 checksums into one. For two sequences of bytes, seq1 + and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for + each, adler1 and adler2. adler32_combine() returns the Adler-32 checksum of + seq1 and seq2 concatenated, requiring only adler1, adler2, and len2. Note + that the z_off_t type (like off_t) is a signed integer. If len2 is + negative, the result has no meaning or utility. +*/ + +ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len)); +/* + Update a running CRC-32 with the bytes buf[0..len-1] and return the + updated CRC-32. If buf is Z_NULL, this function returns the required + initial value for the crc. Pre- and post-conditioning (one's complement) is + performed within this function so it shouldn't be done by the application. 
+ + Usage example: + + uLong crc = crc32(0L, Z_NULL, 0); + + while (read_buffer(buffer, length) != EOF) { + crc = crc32(crc, buffer, length); + } + if (crc != original_crc) error(); +*/ + +/* +ZEXTERN uLong ZEXPORT crc32_combine OF((uLong crc1, uLong crc2, z_off_t len2)); + + Combine two CRC-32 check values into one. For two sequences of bytes, + seq1 and seq2 with lengths len1 and len2, CRC-32 check values were + calculated for each, crc1 and crc2. crc32_combine() returns the CRC-32 + check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and + len2. +*/ + + + /* various hacks, don't look :) */ + +/* deflateInit and inflateInit are macros to allow checking the zlib version + * and the compiler's view of z_stream: + */ +ZEXTERN int ZEXPORT deflateInit_ OF((z_streamp strm, int level, + const char *version, int stream_size)); +ZEXTERN int ZEXPORT inflateInit_ OF((z_streamp strm, + const char *version, int stream_size)); +ZEXTERN int ZEXPORT deflateInit2_ OF((z_streamp strm, int level, int method, + int windowBits, int memLevel, + int strategy, const char *version, + int stream_size)); +ZEXTERN int ZEXPORT inflateInit2_ OF((z_streamp strm, int windowBits, + const char *version, int stream_size)); +ZEXTERN int ZEXPORT inflateBackInit_ OF((z_streamp strm, int windowBits, + unsigned char FAR *window, + const char *version, + int stream_size)); +#define deflateInit(strm, level) \ + deflateInit_((strm), (level), ZLIB_VERSION, (int)sizeof(z_stream)) +#define inflateInit(strm) \ + inflateInit_((strm), ZLIB_VERSION, (int)sizeof(z_stream)) +#define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \ + deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\ + (strategy), ZLIB_VERSION, (int)sizeof(z_stream)) +#define inflateInit2(strm, windowBits) \ + inflateInit2_((strm), (windowBits), ZLIB_VERSION, \ + (int)sizeof(z_stream)) +#define inflateBackInit(strm, windowBits, window) \ + inflateBackInit_((strm), (windowBits), (window), \ + 
ZLIB_VERSION, (int)sizeof(z_stream)) + +#ifndef Z_SOLO + +/* gzgetc() macro and its supporting function and exposed data structure. Note + * that the real internal state is much larger than the exposed structure. + * This abbreviated structure exposes just enough for the gzgetc() macro. The + * user should not mess with these exposed elements, since their names or + * behavior could change in the future, perhaps even capriciously. They can + * only be used by the gzgetc() macro. You have been warned. + */ +struct gzFile_s { + unsigned have; + unsigned char *next; + z_off64_t pos; +}; +ZEXTERN int ZEXPORT gzgetc_ OF((gzFile file)); /* backward compatibility */ +#ifdef Z_PREFIX_SET +# undef z_gzgetc +# define z_gzgetc(g) \ + ((g)->have ? ((g)->have--, (g)->pos++, *((g)->next)++) : gzgetc(g)) +#else +# define gzgetc(g) \ + ((g)->have ? ((g)->have--, (g)->pos++, *((g)->next)++) : gzgetc(g)) +#endif + +/* provide 64-bit offset functions if _LARGEFILE64_SOURCE defined, and/or + * change the regular functions to 64 bits if _FILE_OFFSET_BITS is 64 (if + * both are true, the application gets the *64 functions, and the regular + * functions are changed to 64 bits) -- in case these are set on systems + * without large file support, _LFS64_LARGEFILE must also be true + */ +#ifdef Z_LARGE64 + ZEXTERN gzFile ZEXPORT gzopen64 OF((const char *, const char *)); + ZEXTERN z_off64_t ZEXPORT gzseek64 OF((gzFile, z_off64_t, int)); + ZEXTERN z_off64_t ZEXPORT gztell64 OF((gzFile)); + ZEXTERN z_off64_t ZEXPORT gzoffset64 OF((gzFile)); + ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off64_t)); + ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off64_t)); +#endif + +#if !defined(ZLIB_INTERNAL) && defined(Z_WANT64) +# ifdef Z_PREFIX_SET +# define z_gzopen z_gzopen64 +# define z_gzseek z_gzseek64 +# define z_gztell z_gztell64 +# define z_gzoffset z_gzoffset64 +# define z_adler32_combine z_adler32_combine64 +# define z_crc32_combine z_crc32_combine64 +# else +# define 
gzopen gzopen64 +# define gzseek gzseek64 +# define gztell gztell64 +# define gzoffset gzoffset64 +# define adler32_combine adler32_combine64 +# define crc32_combine crc32_combine64 +# endif +# ifndef Z_LARGE64 + ZEXTERN gzFile ZEXPORT gzopen64 OF((const char *, const char *)); + ZEXTERN z_off_t ZEXPORT gzseek64 OF((gzFile, z_off_t, int)); + ZEXTERN z_off_t ZEXPORT gztell64 OF((gzFile)); + ZEXTERN z_off_t ZEXPORT gzoffset64 OF((gzFile)); + ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off_t)); + ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off_t)); +# endif +#else + ZEXTERN gzFile ZEXPORT gzopen OF((const char *, const char *)); + ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile, z_off_t, int)); + ZEXTERN z_off_t ZEXPORT gztell OF((gzFile)); + ZEXTERN z_off_t ZEXPORT gzoffset OF((gzFile)); + ZEXTERN uLong ZEXPORT adler32_combine OF((uLong, uLong, z_off_t)); + ZEXTERN uLong ZEXPORT crc32_combine OF((uLong, uLong, z_off_t)); +#endif + +#else /* Z_SOLO */ + + ZEXTERN uLong ZEXPORT adler32_combine OF((uLong, uLong, z_off_t)); + ZEXTERN uLong ZEXPORT crc32_combine OF((uLong, uLong, z_off_t)); + +#endif /* !Z_SOLO */ + +/* hack for buggy compilers */ +#if !defined(ZUTIL_H) && !defined(NO_DUMMY_DECL) + struct internal_state {int dummy;}; +#endif + +/* undocumented functions */ +ZEXTERN const char * ZEXPORT zError OF((int)); +ZEXTERN int ZEXPORT inflateSyncPoint OF((z_streamp)); +ZEXTERN const z_crc_t FAR * ZEXPORT get_crc_table OF((void)); +ZEXTERN int ZEXPORT inflateUndermine OF((z_streamp, int)); +ZEXTERN int ZEXPORT inflateResetKeep OF((z_streamp)); +ZEXTERN int ZEXPORT deflateResetKeep OF((z_streamp)); +#if defined(_WIN32) && !defined(Z_SOLO) +ZEXTERN gzFile ZEXPORT gzopen_w OF((const wchar_t *path, + const char *mode)); +#endif +#if defined(STDC) || defined(Z_HAVE_STDARG_H) +# ifndef Z_SOLO +ZEXTERN int ZEXPORTVA gzvprintf Z_ARG((gzFile file, + const char *format, + va_list va)); +# endif +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* 
ZLIB_H */ diff --git a/src/kat.cc b/src/kat.cc new file mode 100644 index 00000000..03efecca --- /dev/null +++ b/src/kat.cc @@ -0,0 +1,34 @@ +#ifdef HAVE_CONFIG_H +#include +#endif + +#include "kat_args.hpp"; +#include "sect/sect.hpp"; + + +// Start point +int main(int argc, char *argv[]) +{ + // Parse args + KatArgs args(argc, argv); + + // Print command line args to stderr if requested + if (args.verbose) + { + args.print(); + } + + + // Pass remaining args to relevant child tool + if (strncmp(args.getMode(), "sect")) + { + sectStart(args.getModeArgC(), args.getModeArgV()); + } + else if (strncmp(args.getMode(), "comp")) + { + compStart(args.getModeArgC(), args.getModeArgV()); + } + + + return 0; +} diff --git a/src/kat_args.hpp b/src/kat_args.hpp new file mode 100644 index 00000000..3e8fde3f --- /dev/null +++ b/src/kat_args.hpp @@ -0,0 +1,156 @@ +#ifndef __SECT_ARGS_HPP__ +#define __SECT_ARGS_HPP__ + +#include +#include + +class KatArgs +{ +private: + string mode_arg; + int mode_argc; + char* mode_argv[]; + +public: + + + // Default constructor + SectArgs() : + mode_arg(NULL), mode_argc(0), mode_argv(NULL) + {} + + // Constructor that parses command line options + SectArgs(int argc, char* argv[]) : + mode_arg(NULL), mode_argc(0), mode_argv(NULL) + { + parse(argc, argv); + } + + string getMode() { + return mode_arg; + } + + int getModeArgC() { + return mode_argc; + } + + char** getModeArgV() { + return mode_argv; + } + + + + +#define kat_args_USAGE "Usage: kat " + const char * usage() const + { + return kat_args_USAGE; + } + + void error(const char *msg) + { + std::cerr << "Error: " << msg << "\n" << usage() + << "\nUse --help for more information" + << std::endl; + exit(1); + } + + +#define kat_args_HELP "The Kmer Analysis Toolkist (KAT) contains a number of tools that analyse jellyfish kmer hashes\n\n" \ + "First argument should be the tool you wish to use: " \ + "Options (default value in (), *required):\n" \ + " --usage Usage\n" \ + " --help This message\n" \ 
+ " -V, --version Version" + + const char * help() const + { + return kat_args_HELP; + } + +#define kat_args_HIDDEN "Hidden options:" + const char * hidden() const + { + return kat_args_HIDDEN; + } + + void print_version(std::ostream &os = std::cout) const + { +#ifndef PACKAGE_VERSION +#define PACKAGE_VERSION "0.1.0" +#endif + os << PACKAGE_VERSION << "\n"; + } + + void parse(int argc, char *argv[]) + { + int c; + + static struct option long_options[] = + { + {"help", no_argument, 0, 'h'}, + {"usage", no_argument, 0, 'u'}, + {"version", no_argument, 0, 'V'}, + {0, 0, 0, 0} + }; + + static const char *short_options = "Vuh"; + + + if (argc == 0) { + std::cout << usage() << "\n\n" << help() << std::endl; + exit(0); + } + else if (validMode(argv[0])) { + mode_arg = argv[0]; + mode_argc = argc -1; + mode_argv = &argv[1]; + } + else { + + while (true) + { + /* getopt_long stores the option index here. */ + int index = -1; + + c = getopt_long (argc, argv, short_options, long_options, &index); + + /* Detect the end of the options. */ + if (c == -1) + break; + + switch (c) + { + case ':': + std::cerr << "Missing required argument for " + << (index == -1 ? std::string(1, (char)optopt) : std::string(long_options[index].name)) + << std::endl; + exit(1); + case 'h': + std::cout << usage() << "\n\n" << help() << std::endl; + exit(0); + case 'u': + std::cout << usage() << "\nUse --help for more information." 
<< std::endl; + exit(0); + case 'V': + print_version(); + exit(0); + case '?': + std::cerr << "Use --usage or --help for some help\n"; + exit(1); + + } + } + } + } + + + void print() + { + } + +private: +}; + +#endif // __SECT_ARGS_HPP__ + diff --git a/src/sect/.deps/.gitignore b/src/sect/.deps/.gitignore new file mode 100644 index 00000000..bcce16fd --- /dev/null +++ b/src/sect/.deps/.gitignore @@ -0,0 +1,2 @@ +/*.Po +/*.dirstamp diff --git a/src/sect/.gitignore b/src/sect/.gitignore new file mode 100644 index 00000000..b3fb5e09 --- /dev/null +++ b/src/sect/.gitignore @@ -0,0 +1,2 @@ +/*.o +/*.dirstamp diff --git a/src/sect/sect.hpp b/src/sect/sect.hpp new file mode 100644 index 00000000..2a027c5f --- /dev/null +++ b/src/sect/sect.hpp @@ -0,0 +1,193 @@ +#if !defined(DEF_SECT_H) +#define DEF_SECT_H + +#include +#include +#include +#include + +#include +#include + +using std::vector; +using std::string; + +template +class Sect : public thread_exec +{ + const hash_t *hash; // Jellyfish hash + const vector *names; // Names of fasta sequences (in same order as seqs) + const vector *seqs; // Sequences in fasta sequences (in same order as names) + const uint_t kmer; // Kmer size to use + const uint_t threads; // Number of threads to use + const size_t bucket_size, remaining; // Chunking vars + vector*> *counts; // Kmer counts for each kmer window in sequence (in same order as seqs and names; built by this class) + vector *coverages; // Overall coverage calculated for each sequence from the kmer windows. + +public: + Sect(const hash_t *_hash, const vector *_names, const vector *_seqs, + uint_t _kmer, uint_t _threads) : + hash(_hash), names(_names), seqs(_seqs), + kmer(_kmer), threads(_threads), + bucket_size(seqs->size() / threads), + remaining(seqs->size() % (bucket_size < 1 ? 
1 : threads)) + { + counts = new vector*>(seqs->size()); + coverages = new vector(seqs->size()); + } + + ~Sect() + { + if(counts) + { + for(uint_t i = 0; i < counts->size(); i++) + { + delete (*counts)[i]; + } + delete counts; + } + + if (coverages) + delete coverages; + } + + + void do_it() + { + exec_join(threads); + } + + void start(int th_id) + { + // Check to see if we have useful work to do for this thread, return if not + if (bucket_size < 1 && th_id >= seqs->size()) + { + return; + } + + //processInBlocks(th_id); + processInterlaced(th_id); + + } + + + void printVars() + { + std::cerr << "Sequences to process: " << seqs->size() << "\n"; + std::cerr << "Kmer: " << kmer << "\n"; + std::cerr << "Threads: " << threads << "\n"; + std::cerr << "Bucket size: " << bucket_size << "\n"; + std::cerr << "Remaining: " << remaining << "\n"; + } + + + void printCounts(std::ostream &out) + { + for(int i = 0; i < names->size(); i++) + { + out << ">" << (*names)[i].c_str() << "\n"; + + vector* seqCounts = (*counts)[i]; + + if (seqCounts != NULL) + { + for(uint_t j = 0; j < seqCounts->size(); j++) + { + out << " " << (*seqCounts)[j]; + } + out << "\n"; + } + else + { + out << " 0\n"; + } + } + } + + void printCoverages(std::ostream &out) + { + for(int i = 0; i < names->size(); i++) + { + out << (*coverages)[i] << "\n"; + } + } + + +private: + + // This method won't be optimal in most cases... Fasta files are normally sorted by length (largest first) + // So first thread will be asked to do more work than the rest + void processInBlocks(uint_t th_id) + { + size_t start = bucket_size < 1 ? th_id : th_id * bucket_size; + size_t end = bucket_size < 1 ? 
th_id : start + bucket_size - 1; + for(size_t i = start; i <= end; i++) + { + processSeq(i); + } + + // Process a remainder if required + if (th_id < remaining) + { + size_t rem_idx = (threads * bucket_size) + th_id; + processSeq(rem_idx); + } + } + + // This method probably makes more efficient use of multiple cores on a length sorted fasta file + void processInterlaced(uint_t th_id) + { + size_t start = th_id; + size_t end = seqs->size(); + for(size_t i = start; i < end; i+=threads) + { + processSeq(i); + } + } + + void processSeq(const size_t index) + { + string seq = (*seqs)[index]; + uint_t nbCounts = seq.length() - kmer + 1; + + if (seq.length() < kmer) + { + std::cerr << (*names)[index].c_str() << ": " << seq << " is too short to compute coverage. Setting sequence coverage to 0.\n"; + return; + } + + std::cerr << "Seq: " << seq << "; Counts to create: " << nbCounts << "\n"; + + vector* seqCounts = new vector(nbCounts); + + uint64_t sum = 0; + + for(uint_t i = 0; i < nbCounts; i++) + { + + string merstr = seq.substr(i, kmer); + + // Jellyfish compacted hash does not support Ns so if we find one set this mer count to 0 + if (merstr.find("N") != string::npos) + { + (*seqCounts)[i] = 0; + } + else + { + const char* mer = merstr.c_str(); + uint_t count = (*hash)[mer]; + sum += count; + + (*seqCounts)[i] = count; + } + } + + (*counts)[index] = seqCounts; + + // Assumes simple mean calculation for sequence coverage for now... plug in Bernardo's method later. 
+ (*coverages)[index] = (float)sum / (float)nbCounts; + } + +}; + +#endif //DEF_SECT_H diff --git a/src/sect/sect_args.hpp b/src/sect/sect_args.hpp new file mode 100644 index 00000000..f1e4eae2 --- /dev/null +++ b/src/sect/sect_args.hpp @@ -0,0 +1,188 @@ +#ifndef __SECT_ARGS_HPP__ +#define __SECT_ARGS_HPP__ + +#include + +class SectArgs +{ +public: + const char * fasta_arg; + const char * db_arg; + const char * input_type; + const char * output_arg; + uint_t threads_arg; + uint_t kmer_arg; + bool verbose; + + // Default constructor + SectArgs() : + output_arg(NULL), threads_arg(1), kmer_arg(31), verbose(false) + {} + + // Constructor that parses command line options + SectArgs(int argc, char* argv[]) : + output_arg(NULL), threads_arg(1), kmer_arg(31), verbose(false) + { + parse(argc, argv); + } + + + + +#define seqcvg_args_USAGE "Usage: sect [options] -f db_path" + const char * usage() const + { + return seqcvg_args_USAGE; + } + + void error(const char *msg) + { + std::cerr << "Error: " << msg << "\n" << usage() + << "\nUse --help for more information" + << std::endl; + exit(1); + } + + +#define sect_args_HELP "Estimates coverage for sequences in a fasta file using jellyfish kmer counts\n\n" \ + "Options (default value in (), *required):\n" \ + " -f, --fasta Fasta file contains sequences that should have coverage estimated. Kmers containing any Ns derived from sequences in the fasta files will have 0 coverage.\n" \ + " -o, --count_output File that should contain the sequence count profiles produced from this program. If not specified count data is not output\n" \ + " -t, --threads The number of threads to use. Default: 1\n" \ + " -k, --kmer The kmer size to use (must be the same as kmer sized used for jellyfish hash). 
Default: 31.\n" \ + " --usage Usage\n" \ + " --help This message\n" \ + " -V, --version Version" + + const char * help() const + { + return sect_args_HELP; + } + +#define sect_args_HIDDEN "Hidden options:" + const char * hidden() const + { + return sect_args_HIDDEN; + } + + void print_version(std::ostream &os = std::cout) const + { +#ifndef PACKAGE_VERSION +#define PACKAGE_VERSION "0.1.0" +#endif + os << PACKAGE_VERSION << "\n"; + } + + void parse(int argc, char *argv[]) + { + int c; + + static struct option long_options[] = + { + {"verbose", no_argument, 0, 'v'}, + {"fasta", required_argument, 0, 'f'}, + {"count_output", required_argument, 0, 'o'}, + {"threads", required_argument, 0, 't'}, + {"kmer", required_argument, 0, 'k'}, + {"help", no_argument, 0, 'h'}, + {"usage", no_argument, 0, 'u'}, + {"version", no_argument, 0, 'V'}, + {0, 0, 0, 0} + }; + + static const char *short_options = "f:o:t:k:Vvuh"; + + while (true) + { + /* getopt_long stores the option index here. */ + int index = -1; + + c = getopt_long (argc, argv, short_options, long_options, &index); + + /* Detect the end of the options. */ + if (c == -1) + break; + + switch (c) + { + case ':': + std::cerr << "Missing required argument for " + << (index == -1 ? std::string(1, (char)optopt) : std::string(long_options[index].name)) + << std::endl; + exit(1); + case 'h': + std::cout << usage() << "\n\n" << help() << std::endl; + exit(0); + case 'u': + std::cout << usage() << "\nUse --help for more information." 
<< std::endl; + exit(0); + case 'V': + print_version(); + exit(0); + case '?': + std::cerr << "Use --usage or --help for some help\n"; + exit(1); + case 'v': + verbose = true; + break; + case 'f': + fasta_arg = optarg; + break; + case 'o': + output_arg = optarg; + break; + case 't': + threads_arg = atoi(optarg); + break; + case 'k': + kmer_arg = atoi(optarg); + break; + + } + } + + + // Parse arguments + if(argc - optind != 1) + error("Requires exactly 1 argument."); + db_arg = argv[optind++]; + } + + bool outputGiven() + { + return output_arg == NULL ? false : true; + } + + + void print() + { + if (verbose) + std::cerr << "Verbose flag set\n"; + + if (fasta_arg) + std::cerr << "Fasta file: " << fasta_arg << "\n"; + + if (threads_arg) + std::cerr << "Threads requested: " << threads_arg << "\n"; + + if (kmer_arg) + std::cerr << "Kmer size: " << kmer_arg << "\n"; + + if (db_arg) + std::cerr << "Jellyfish hash: " << db_arg << "\n"; + + if (outputGiven()) + { + std::cerr << "Count Output file provided: " << output_arg << "\n"; + } + else + { + std::cerr << "No output argument provided. Not outputing count information\n"; + } + } + +private: +}; + +#endif // __SECT_ARGS_HPP__ + diff --git a/src/sect/sect_main.cc b/src/sect/sect_main.cc new file mode 100644 index 00000000..3d87ae00 --- /dev/null +++ b/src/sect/sect_main.cc @@ -0,0 +1,156 @@ +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include + + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "sect_args.hpp" +#include "sect.hpp" + +using std::vector; +using std::string; + +KSEQ_INIT(gzFile, gzread) + + +// Loads Fasta file into memory. Two vectors hold names and sequences respectively. 
Assumes no reordering will +// take place +void readFasta(const char *fastaPath, vector& fastaNames, vector& fastaSeqs) +{ + std::cerr << "Fasta file load: " << fastaPath << "\n"; + + gzFile fp; + fp = gzopen(fastaPath, "r"); // STEP 2: open the file handler + kseq_t *seq = kseq_init(fp); // STEP 3: initialize seq + int l; + while ((l = kseq_read(seq)) >= 0) // STEP 4: read sequence + { + fastaNames.push_back(seq->name.s); + fastaSeqs.push_back(seq->seq.s); + } + printf("return value: %d\n", l); + kseq_destroy(seq); // STEP 5: destroy seq + gzclose(fp); // STEP 6: close the file handler +} + +// Debugging routine.. not required for normal use +void printFastaData(vector& names, vector& seqs) +{ + if (names.size() != seqs.size()) + { + printf("something went wrong!!\n"); + return; + } + + printf("Printing fasta data\n"); + + for (int i = 0; i < names.size(); i++) + { + printf("name: %s\n", names[i].c_str()); + printf("seq: %s\n", seqs[i].c_str()); + } +} + +// Start point +int sectStart(int argc, char *argv[]) +{ + // Parse args + SectArgs args(argc, argv); + + // Print command line args to stderr if requested + if (args.verbose) + { + args.print(); + } + + // Create handle to the memory mapped hash file into memory + mapped_file dbf(args.db_arg); + + // Advise kernel on how we will use this memory (i.e. random access and we'll + // need it available soon.) + dbf.random().will_need(); + + // Load enitre fasta file into memory + vector names; + vector seqs; + readFasta(args.fasta_arg, names, seqs); + + // Get jellyfish has type + char type[8]; + memcpy(type, dbf.base(), sizeof(type)); + + // Process data differently depending on jellyfish hash type + if(!strncmp(type, jellyfish::raw_hash::file_type, sizeof(type))) + { + std::cerr << "Raw hash detected but not supported yet\n"; + } + else if(!strncmp(type, jellyfish::compacted_hash::file_type, sizeof(type))) + { + if (args.verbose) + { + std::cerr << "Compacted hash detected. 
Setting up query structure.\n"; + } + + // Load the jellyfish hash object + hash_query_t hash(dbf); + + // Output jellyfish has details if requested + if (args.verbose) + { + std::cerr << "mer length = " << hash.get_mer_len() << "\n" + << "hash size = " << hash.get_size() << "\n" + << "max reprobe = " << hash.get_max_reprobe() << "\n" + << "matrix = " << hash.get_hash_matrix().xor_sum() << "\n" + << "inv_matrix = " << hash.get_hash_inverse_matrix().xor_sum() << "\n"; + } + + // Create the sequence coverage object + Sect sect(&hash, &names, &seqs, args.kmer_arg, args.threads_arg); + + // Output seqcvg parameters to stderr if requested + if (args.verbose) + { + sect.printVars(); + } + + // Do the work + sect.do_it(); + + // Send sequence kmer counts to file if requested + if (args.outputGiven()) + { + ofstream_default count_out(args.outputGiven() ? args.output_arg : 0, std::cout); + sect.printCounts(count_out); + count_out.close(); + } + + // Send sequence coverage scores to standard out + ofstream_default out(0, std::cout); + sect.printCoverages(out); + out.close(); + } + else + { + die << "Invalid jellyfish file type '" << err::substr(type, sizeof(type)) << "'."; + } + + return 0; +}