diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index f814bb0..18f386b 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -9,7 +9,7 @@ RUN apt-get install -y \ # Packages for local testing RUN pip install --upgrade pip setuptools wheel -RUN pip install pytest h5py numpy +RUN pip install pytest numpy # Packages for local build RUN pip install cmake ninja @@ -25,5 +25,4 @@ RUN apt-get install -y \ RUN apt-get install -y \ python3-dev \ libboost-dev \ - libeigen3-dev \ pybind11-dev diff --git a/.github/workflows/build-cpp-libs.yml b/.github/workflows/build-cpp-libs.yml index c3b3dfa..05e8cd5 100644 --- a/.github/workflows/build-cpp-libs.yml +++ b/.github/workflows/build-cpp-libs.yml @@ -19,16 +19,16 @@ jobs: include: - os: ubuntu-22.04 build-type: Debug - system-deps: "sudo apt -y install libboost-dev libboost-math-dev libeigen3-dev" + system-deps: "sudo apt -y install libboost-dev libboost-math-dev" - os: ubuntu-22.04 build-type: Release - system-deps: "sudo apt -y install libboost-dev libboost-math-dev libeigen3-dev" + system-deps: "sudo apt -y install libboost-dev libboost-math-dev" - os: macos-14 build-type: Debug - system-deps: "brew install boost eigen" + system-deps: "brew install boost" - os: macos-14 build-type: Release - system-deps: "brew install boost eigen" + system-deps: "brew install boost" steps: diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index f80bc56..02fc999 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -27,12 +27,12 @@ jobs: - os: macos-13 arch: x86_64 py-vers: cp38-* cp39-* cp310-* cp311-* cp312-* - before-all: brew install cmake ninja boost eigen + before-all: brew install cmake ninja boost extra-env: CC="$(brew --prefix llvm@15)/bin/clang" CXX="$(brew --prefix llvm@15)/bin/clang++" HOMEBREW_NO_INSTALLED_DEPENDENTS_CHECK=1 - os: macos-14 arch: arm64 py-vers: cp39-* cp310-* cp311-* cp312-* - before-all: brew install cmake ninja boost eigen + before-all: brew install cmake ninja boost extra-env: CC="$(brew --prefix llvm@15)/bin/clang" CXX="$(brew --prefix llvm@15)/bin/clang++" HOMEBREW_NO_INSTALLED_DEPENDENTS_CHECK=1 env: diff --git a/.github/workflows/ctest-macos.yml b/.github/workflows/ctest-macos.yml index baf3af5..f0a8882 100644 --- a/.github/workflows/ctest-macos.yml +++ b/.github/workflows/ctest-macos.yml @@ -16,7 +16,7 @@ jobs: uses: actions/checkout@v2 - name: install system packages - run: brew install boost eigen + run: brew install boost - name: make build directory run: mkdir build_dir diff --git a/.github/workflows/ctest-ubuntu.yml b/.github/workflows/ctest-ubuntu.yml index e7cc5c0..dcd7e2f 100644 --- a/.github/workflows/ctest-ubuntu.yml +++ b/.github/workflows/ctest-ubuntu.yml @@ -21,7 +21,7 @@ jobs: - name: install system packages run: | sudo apt -y update - sudo apt -y install libboost-dev libeigen3-dev + sudo apt -y install libboost-dev - name: make build directory run: mkdir build_dir diff --git a/.github/workflows/pytest-ubuntu.yml b/.github/workflows/pytest-ubuntu.yml index c809702..7395f96 100644 --- a/.github/workflows/pytest-ubuntu.yml +++ b/.github/workflows/pytest-ubuntu.yml @@ -38,7 +38,7 @@ jobs: - name: install system packages run: | sudo apt -y update - sudo apt -y install libboost-dev libeigen3-dev + sudo apt -y install libboost-dev - name: install python dependencies run: | diff --git a/.vscode/settings.json b/.vscode/settings.json index 6e42dfb..a67d847 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -7,7 +7,85 @@ "cmake.generator": "Ninja", "cmake.configureOnOpen": false, "cmake.configureSettings": { - //"ENABLE_TESTING": "ON", - //"WARNINGS_AS_ERRORS": "TRUE", + "ENABLE_TESTING": "ON", + "WARNINGS_AS_ERRORS": "TRUE" + }, + "files.associations": { + "iosfwd": "cpp", + "cctype": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "csignal": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "array": "cpp", + "atomic": "cpp", + "strstream": "cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "chrono": "cpp", + "codecvt": "cpp", + "compare": "cpp", + "complex": "cpp", + "concepts": "cpp", + "condition_variable": "cpp", + "cstdint": "cpp", + "deque": "cpp", + "forward_list": "cpp", + "list": "cpp", + "map": "cpp", + "set": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "ratio": "cpp", + "regex": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "fstream": "cpp", + "future": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iostream": "cpp", + "istream": "cpp", + "limits": "cpp", + "mutex": "cpp", + "new": "cpp", + "numbers": "cpp", + "ostream": "cpp", + "semaphore": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "stop_token": "cpp", + "streambuf": "cpp", + "thread": "cpp", + "cfenv": "cpp", + "cinttypes": "cpp", + "typeindex": "cpp", + "typeinfo": "cpp", + "valarray": "cpp", + "variant": "cpp", + "core": "cpp", + "geometry": "cpp" } } diff --git a/CMakeLists.txt b/CMakeLists.txt index 60d9b8b..197cef7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,18 @@ -# This file is part of https://github.com/PalamaraLab/threads which is released under the GPL-3.0 license. -# See accompanying LICENSE and COPYING for copyright notice and full details. +# This file is part of the Threads software suite. +# Copyright (C) 2024 Threads Developers. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . cmake_minimum_required(VERSION 3.16) message(STATUS "Using CMake version ${CMAKE_VERSION}") @@ -16,9 +29,7 @@ include(cmake/ProjectSettings.cmake) # Link this 'library' to use the warnings specified in CompilerWarnings.cmake add_library(project_warnings INTERFACE) include(cmake/CompilerWarnings.cmake) - -# FIXME Uncomment to enable warnings -#set_project_warnings(project_warnings) +set_project_warnings(project_warnings) # Sanitiser options if supported by compiler include(cmake/Sanitisers.cmake) diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f288702 --- /dev/null +++ b/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. diff --git a/README.md b/README.md index 7ecd6aa..1abd197 100644 --- a/README.md +++ b/README.md @@ -8,12 +8,9 @@ module load git/2.36.0-GCCcore-11.3.0-nodocs module load Python/3.10.4-GCCcore-11.3.0 module load GSL/2.7-GCC-11.3.0 module load Boost/1.79.0-GCC-11.3.0 -module load Eigen/3.4.0-GCCcore-11.3.0 module load pybind11/2.9.2-GCCcore-11.3.0 ``` -NB: Eigen is only used for functionality in TGEN.cpp. We can consider releasing without TGEN to have fewer dependencies. - Fire up and activate a new venv: ``` python -m venv venv diff --git a/cmake/CompilerWarnings.cmake b/cmake/CompilerWarnings.cmake index c2965c8..2adab9c 100644 --- a/cmake/CompilerWarnings.cmake +++ b/cmake/CompilerWarnings.cmake @@ -44,7 +44,9 @@ function(set_project_warnings project_name) -Woverloaded-virtual # warn if you overload (not override) a virtual function -Wpedantic # warn if non-standard C++ is used -Wconversion # warn on type conversions that may lose data - -Wsign-conversion # warn on sign conversions + # TODO replace -Wno-sign-conversion with -Wsign-conversion (ticket #26) + -Wno-sign-conversion # disabled until int/size_t resolved + #-Wsign-conversion # warn on sign conversions -Wnull-dereference # warn if a null dereference is detected -Wdouble-promotion # warn if float is implicit promoted to double -Wformat=2 # warn on security issues around functions that format output (ie printf) diff --git a/setup.py b/setup.py index 1823220..70a9b2e 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,19 @@ +# This file is part of the Threads software suite. +# Copyright (C) 2024 Threads Developers. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + # Based on https://github.com/pybind/cmake_example import os @@ -46,7 +62,7 @@ def build_extension(self, ext): f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}", f"-DPYTHON_EXECUTABLE={sys.executable}", f"-DCMAKE_BUILD_TYPE={cfg}", - "-DWARNINGS_AS_ERRORS=OFF", + "-DWARNINGS_AS_ERRORS=ON", "-DENABLE_TESTING=OFF", "-DBoost_NO_BOOST_CMAKE=ON", # from arni: o/w boost 1.74 gets confused re. mtx "-DMAKE_DOCS=OFF" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5dd6146..451e5c2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,18 @@ -# This file is part of https://github.com/PalamaraLab/threads which is released under the GPL-3.0 license. -# See accompanying LICENSE and COPYING for copyright notice and full details. +# This file is part of the Threads software suite. +# Copyright (C) 2024 Threads Developers. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . # Find Boost find_package(Boost REQUIRED) @@ -14,23 +27,6 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") endif() endif() -# Try to find Eigen3 version 3.4 locally installed -find_package(Eigen3 3.4 QUIET) - -if(NOT Eigen3_FOUND) - include(FetchContent) - message(STATUS "Suitable local version of Eigen3 (>= 3.4) not found, fetching from repository...") - FetchContent_Declare( - Eigen3 - GIT_REPOSITORY https://gitlab.com/libeigen/eigen.git - GIT_TAG 9df21dc8b4b576a7aa5c0094daa8d7e8b8be60f0 # 3.4 branch at 2024-03-15 - GIT_SHALLOW TRUE - ) - FetchContent_MakeAvailable(Eigen3) - set(Eigen3_VERSION "3.4") -endif() -message(STATUS "Found Eigen3 ${Eigen3_VERSION}") - # Threads static library set(threads_arg_src Demography.cpp @@ -77,8 +73,6 @@ target_link_libraries(threads_arg PRIVATE Boost::headers project_warnings - PUBLIC - Eigen3::Eigen ) # Conditionally create python bindings diff --git a/src/Demography.cpp b/src/Demography.cpp index c6ca316..6c1c9c4 100644 --- a/src/Demography.cpp +++ b/src/Demography.cpp @@ -1,20 +1,36 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #include "Demography.hpp" -#include // for std::sort + +#include #include #include #include -#include // to throw errors +#include #include - Demography::Demography(std::vector _sizes, std::vector _times) : times(_times), sizes(_sizes), std_times(std::vector()) { if (times.size() != sizes.size()) { throw std::runtime_error("Demography times and sizes must have equal length"); } - for (int i = 0; i < times.size(); i++) { - if (times[i] < 0 || sizes[i] <= 0) { + for (std::size_t i = 0; i < times.size(); i++) { + if ((times[i] < 0.0) || (sizes[i] <= 0.0)) { throw std::runtime_error("Demography expects non-negative times and strictly positive sizes"); } @@ -33,9 +49,9 @@ Demography::Demography(std::vector _sizes, std::vector _times) } // Compute times in standard coalescent space - int K = times.size(); + std::size_t K = times.size(); std_times.push_back(0.0); - for (int i = 1; i < K; i++) { + for (std::size_t i = 1; i < K; i++) { double d = (times[i] - times[i - 1]) / sizes[i - 1]; std_times.push_back(std_times[i - 1] + d); } @@ -45,27 +61,28 @@ Demography::Demography(std::vector _sizes, std::vector _times) } double Demography::std_to_gen(const double t) { - if (t < 0) { - throw std::runtime_error("Demography can only convert non-negative times to std"); + if (t < std_times.front()) { + throw std::runtime_error("Demography can only convert times greater than the first entry"); } + // Find the highest i s.t. std_times[i] <= t. - int i = - std::distance(std_times.begin(), std::upper_bound(std_times.begin(), std_times.end(), t)) - 1; + const auto it = std::upper_bound(std_times.begin(), std_times.end(), t); + + // Defensive check as the t < std_times.front check above should mean `it` is never first + if (it == std_times.begin()) { + throw std::runtime_error("Unexpected std_to_gen upper bound finding first element"); + } + + std::size_t i = static_cast(it - std_times.begin() - 1); return times[i] + (t - std_times[i]) * sizes[i]; } -/** - * @brief Compute the expected length of the N-th branch - * - * @param N - * @return double - */ double Demography::expected_branch_length(const int N) { return std_to_gen(2. / N); } std::ostream& operator<<(std::ostream& os, const Demography& d) { - for (int i = 0; i < d.sizes.size(); i++) { + for (std::size_t i = 0; i < d.sizes.size(); i++) { std::cout << d.times[i] << " " << d.sizes[i] << " " << d.std_times[i] << std::endl; } return os; diff --git a/src/Demography.hpp b/src/Demography.hpp index 06efa23..f6249ee 100644 --- a/src/Demography.hpp +++ b/src/Demography.hpp @@ -1,30 +1,45 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #ifndef THREADS_ARG_DEMOGRAPHY_HPP #define THREADS_ARG_DEMOGRAPHY_HPP -#include #include +#include -// This class is a wrapper for simple coalescence time queries under a piecewise-constant demography +/// This class is a wrapper for simple coalescence time queries under a piecewise-constant +/// demography class Demography { public: - // These are in generations - std::vector times; - // These are *haploid* - std::vector sizes; - // Normalised coalescence times - std::vector std_times; - // Expected pairwise coalescent time - double expected_time = 0.0; - Demography(std::vector _times, std::vector _sizes); - // Map a time in the standard coalescent to generations under this demography + /// Map a time in the standard coalescent to generations under this demography double std_to_gen(const double t); - // The expected branch length of a new branch in a tree with N leaves + + /// The expected branch length of a new branch in a tree with N leaves double expected_branch_length(const int N); - // Output + /// Stream output friend std::ostream& operator<<(std::ostream& os, const Demography& demography); + +public: + std::vector times; ///< These are in generations + std::vector sizes; ///< These are *haploid* + std::vector std_times; ///< Normalised coalescence times + double expected_time = 0.0; ///< Expected pairwise coalescent time }; #endif // THREADS_ARG_DEMOGRAPHY_HPP diff --git a/src/HMM.cpp b/src/HMM.cpp index bbea44a..dd48636 100644 --- a/src/HMM.cpp +++ b/src/HMM.cpp @@ -1,9 +1,25 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #include "HMM.hpp" + #include #include #include - HMM::HMM(Demography demography, std::vector bp_sizes, std::vector cm_sizes, double mutation_rate, int K) : num_states(K) { @@ -12,7 +28,8 @@ HMM::HMM(Demography demography, std::vector bp_sizes, std::vector trellis_row(num_states, 0.0); std::vector pointer_row(num_states, 0); trellis.push_back(trellis_row); @@ -21,19 +38,19 @@ HMM::HMM(Demography demography, std::vector bp_sizes, std::vector HMM::compute_expected_times(Demography demography, const int K) { - std::vector expected_times; + std::vector result; double k = static_cast(num_states); boost::math::exponential e; for (int i = 1; i <= K; i++) { double t = demography.std_to_gen(quantile(e, (i - 0.5) / k)); - expected_times.push_back(t); + result.push_back(t); } - return expected_times; + return result; } void HMM::compute_recombination_scores(std::vector cm_sizes) { - for (int i = 0; i < cm_sizes.size(); i++) { + for (std::size_t i = 0; i < cm_sizes.size(); i++) { non_transition_score.push_back(std::vector()); transition_score.push_back(std::vector()); for (int k = 0; k < num_states; k++) { @@ -51,14 +68,14 @@ void HMM::compute_recombination_scores(std::vector cm_sizes) { } void HMM::compute_mutation_scores(std::vector bp_sizes, double mutation_rate) { - for (int i = 0; i < bp_sizes.size(); i++) { + for (std::size_t i = 0; i < bp_sizes.size(); i++) { hom_score.push_back(std::vector()); het_score.push_back(std::vector()); for (int k = 0; k < num_states; k++) { double t = expected_times[k]; + // TODO: use mean-bp sizes here as in the main algorithm const double l = 2. * mutation_rate * bp_sizes[i] * t; - const double trans = std::log1p(-std::exp(-l)); // log-prob of mutating het_score[i].push_back(std::log1p(-std::exp(-l))); @@ -75,17 +92,20 @@ void HMM::compute_mutation_scores(std::vector bp_sizes, double mutation_ std::vector HMM::breakpoints(std::vector observations, int start) { // Viterbi // Initialize - int neighborhood_size = observations.size(); + int neighborhood_size = static_cast(observations.size()); std::vector z(neighborhood_size); - int end = start + neighborhood_size; for (int i = 0; i < num_states; i++) { double score = observations[0] ? het_score[start][i] : hom_score[start][i]; trellis[start][i] = score; } + if (num_states > std::numeric_limits::max()) { + throw std::runtime_error("Unable to store breakpoints for more than 2^16 states"); + } + // Main routine - double score; - unsigned short running_argmax; + double score = 0.0; + unsigned short running_argmax = 0; for (int j = 1; j < neighborhood_size; j++) { for (int i = 0; i < num_states; i++) { double running_max = 0; @@ -98,7 +118,7 @@ std::vector HMM::breakpoints(std::vector observations, int start) { if (score > running_max || k == 0) { running_max = score; - running_argmax = k; + running_argmax = static_cast(k); } } trellis[j + start][i] = running_max; @@ -113,7 +133,7 @@ std::vector HMM::breakpoints(std::vector observations, int start) { double s = trellis[start + neighborhood_size - 1][k]; if (s > running_max) { running_max = s; - argmax = k; + argmax = static_cast(k); } } diff --git a/src/HMM.hpp b/src/HMM.hpp index 379ecba..0ce0201 100644 --- a/src/HMM.hpp +++ b/src/HMM.hpp @@ -1,32 +1,47 @@ -#ifndef DEMOGRAPHY -#define DEMOGRAPHY +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #include "Demography.hpp" -#endif // DEMOGRAPHY -#include #include +#include - -// This class contains the PSMC-like algorithm used to break up segments for small-N inference +/// This class contains the PSMC-like algorithm used to break up segments for small-N inference class HMM { public: - // HMM data - int num_states = 0; - std::vector expected_times; + HMM(Demography demography, std::vector bp_sizes, std::vector cm_sizes, + double mutation_rate, int K); + HMM() = default; - std::vector> non_transition_score; - std::vector> transition_score; - std::vector> hom_score; - std::vector> het_score; void compute_recombination_scores(std::vector cm_sizes); void compute_mutation_scores(std::vector bp_sizes, double mutation_rate); std::vector compute_expected_times(Demography demography, int K); + std::vector breakpoints(std::vector observations, int start); + +public: + // HMM data + int num_states = 0; + std::vector expected_times; // HMM working quantities std::vector> trellis; std::vector> pointers; - HMM(Demography demography, std::vector bp_sizes, std::vector cm_sizes, double mutation_rate, int K); - HMM() {}; - std::vector breakpoints(std::vector observations, int start); + std::vector> non_transition_score; + std::vector> transition_score; + std::vector> hom_score; + std::vector> het_score; }; diff --git a/src/ImputationMatcher.cpp b/src/ImputationMatcher.cpp index 90cefd4..19e0390 100644 --- a/src/ImputationMatcher.cpp +++ b/src/ImputationMatcher.cpp @@ -1,3 +1,19 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #include "ImputationMatcher.hpp" #include @@ -8,33 +24,37 @@ #include #include +// Uncomment this #define to enable a runtime check that genetic position are in order. This +// diagnostic check is left in whilst we may have issues during development. +// #define IMPUTATION_MATCHER_CHECK_IN_ORDER ImputationMatcher::ImputationMatcher(int _n_ref, int _n_target, const std::vector& _genetic_positions, double _query_interval_size, int _neighborhood_size) - : num_reference(_n_ref), num_target(_n_target), genetic_positions(_genetic_positions), - query_interval_size(_query_interval_size), neighborhood_size(_neighborhood_size) { + : neighborhood_size(_neighborhood_size), num_reference(_n_ref), num_target(_n_target), + query_interval_size(_query_interval_size), genetic_positions(_genetic_positions) { if (genetic_positions.size() <= 2) { throw std::runtime_error("Need at least 3 sites, found " + std::to_string(genetic_positions.size())); } - num_sites = genetic_positions.size(); + num_sites = static_cast(genetic_positions.size()); num_samples = num_reference + num_target; sites_processed = 0; - // Check maps are strictly increasing - // for (int i = 0; i < num_sites - 1; i++) { - // if (genetic_positions.at(i + 1) <= genetic_positions.at(i)) { - // std::string prompt = "Genetic coordinates must be strictly increasing, found "; - // prompt += std::to_string(genetic_positions[i + 1]) + " after " + - // std::to_string(genetic_positions[i]); - // throw std::runtime_error(prompt); - // } - // } +#ifdef IMPUTATION_MATCHER_CHECK_IN_ORDER + for (int i = 0; i < num_sites - 1; i++) { + if (genetic_positions.at(i + 1) <= genetic_positions.at(i)) { + std::string prompt = "Genetic coordinates must be strictly increasing, found "; + prompt += std::to_string(genetic_positions[i + 1]) + " after " + + std::to_string(genetic_positions[i]); + throw std::runtime_error(prompt); + } + } +#endif // IMPUTATION_MATCHER_CHECK_IN_ORDER int query_site_idx = 1; double gen_pos_offset = genetic_positions[0]; - for (int i = 0; i < genetic_positions.size(); i++) { + for (int i = 0; i < static_cast(genetic_positions.size()); i++) { double cm = genetic_positions.at(i); if (cm > gen_pos_offset + query_interval_size * query_site_idx) { query_sites.push_back(i); @@ -82,11 +102,11 @@ void ImputationMatcher::process_site(const std::vector& genotype) { } } - if (genotype.size() != num_samples) { + if (static_cast(genotype.size()) != num_samples) { throw std::runtime_error("invalid genotype vector size"); } - if (next_query_site_idx < query_sites.size() && + if (next_query_site_idx < static_cast(query_sites.size()) && sites_processed == query_sites.at(next_query_site_idx)) { next_query_site_idx++; int ref_allele_count = 0; @@ -175,10 +195,10 @@ void ImputationMatcher::process_site(const std::vector& genotype) { sites_processed++; } -std::unordered_map> ImputationMatcher::get_matches() { +const std::unordered_map>& ImputationMatcher::get_matches() const { return match_sets; } -std::vector ImputationMatcher::get_sorting() { +const std::vector& ImputationMatcher::get_sorting() const { return sorting; } diff --git a/src/ImputationMatcher.hpp b/src/ImputationMatcher.hpp index d2202f7..8479db8 100644 --- a/src/ImputationMatcher.hpp +++ b/src/ImputationMatcher.hpp @@ -1,3 +1,19 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #ifndef THREADS_ARG_IMPUTATION_MATCHER_HPP #define THREADS_ARG_IMPUTATION_MATCHER_HPP @@ -5,23 +21,19 @@ #include #include -// NB this recycles a lot of code from Matcher.hpp +/// This is a version of the Threads haplotype matching algorithm +/// adapted to be used for imputation. +/// NB this recycles a lot of code from Matcher.hpp class ImputationMatcher { +public: + ImputationMatcher(int _n_ref, int _n_target, const std::vector& _genetic_positions, + double _query_interval_size, int _neighborhood_size); -private: - int sites_processed = 0; - int next_query_site_idx = 0; - - // pbwt quantities - std::vector sorting; - std::vector next_sorting; - - // querying quantities - std::vector ref_sorting; + void process_site(const std::vector& genotype); + const std::unordered_map>& get_matches() const; + const std::vector& get_sorting() const; public: - // This is a version of the Threads haplotype matching algorithm - // adapted to be used for imputation. // TODO: include a second pass through data here to get divergence values and not do that using // Threads-fastLS int neighborhood_size = 0; @@ -33,14 +45,17 @@ class ImputationMatcher { double query_interval_size = 0.0; std::unordered_map> match_sets; std::vector genetic_positions; - ImputationMatcher(int _n_ref, int _n_target, const std::vector& _genetic_positions, - double _query_interval_size, int _neighborhood_size); - // Do all the work - void process_site(const std::vector& genotype); - std::unordered_map> get_matches(); +private: + int sites_processed = 0; + int next_query_site_idx = 0; - std::vector get_sorting(); + // pbwt quantities + std::vector sorting; + std::vector next_sorting; + + // querying quantities + std::vector ref_sorting; }; #endif // THREADS_ARG_IMPUTATION_MATCHER_HPP diff --git a/src/Matcher.cpp b/src/Matcher.cpp index 733118e..a95ac05 100644 --- a/src/Matcher.cpp +++ b/src/Matcher.cpp @@ -1,3 +1,19 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #include "Matcher.hpp" #include @@ -8,7 +24,6 @@ #include #include - // For a given interval, this contains all the matches for all the samples MatchGroup::MatchGroup(int _num_samples, double _cm_position) : num_samples(_num_samples), cm_position(_cm_position) { @@ -24,7 +39,7 @@ MatchGroup::MatchGroup(const std::vector& target_ids, if (target_ids.size() != matches.size()) { throw std::runtime_error("Inconsistent target/matches sizes"); } - for (int i = 0; i < target_ids.size(); i++) { + for (int i = 0; i < static_cast(target_ids.size()); i++) { match_candidates[target_ids[i]] = matches[i]; } } @@ -78,7 +93,7 @@ void MatchGroup::filter_matches(int min_matches) { // Then determine top 4 candidates for neighboring groups top_four_maps.reserve(num_samples); for (int i = 0; i < num_samples; i++) { - top_four_maps.emplace_back(std::min(4, (int) match_candidates.at(i).size())); + top_four_maps.emplace_back(std::min(4, static_cast(match_candidates.at(i).size()))); std::partial_sort_copy(match_candidates_counts.at(i).begin(), match_candidates_counts.at(i).end(), top_four_maps.at(i).begin(), top_four_maps.at(i).end(), @@ -107,12 +122,13 @@ Matcher::Matcher(int _n, const std::vector& _genetic_positions, double _ double _match_group_interval_size, int _neighborhood_size, int _min_matches) : num_samples(_n), genetic_positions(_genetic_positions), query_interval_size(_query_interval_size), - match_group_interval_size(_match_group_interval_size), neighborhood_size(_neighborhood_size), min_matches(_min_matches) { + match_group_interval_size(_match_group_interval_size), neighborhood_size(_neighborhood_size), + min_matches(_min_matches) { if (genetic_positions.size() <= 2) { throw std::runtime_error("Need at least 3 sites, found " + std::to_string(genetic_positions.size())); } - num_sites = genetic_positions.size(); + num_sites = static_cast(genetic_positions.size()); sites_processed = 0; // Check maps are strictly increasing @@ -130,7 +146,7 @@ Matcher::Matcher(int _n, const std::vector& _genetic_positions, double _ int match_group_site_idx = 1; double gen_pos_offset = genetic_positions[0]; match_group_sites = {0}; - for (int i = 0; i < genetic_positions.size(); i++) { + for (int i = 0; i < static_cast(genetic_positions.size()); i++) { double cm = genetic_positions.at(i); if (cm > gen_pos_offset + query_interval_size * query_site_idx) { query_sites.push_back(i); @@ -162,7 +178,7 @@ Matcher::Matcher(int _n, const std::vector& _genetic_positions, double _ } match_group_idx = 0; std::cout << "Will use " << query_sites.size() << " query sites and " << match_group_sites.size() - << " match_group_sites" << std::endl; + << " match_group_sites" << std::endl; match_groups.reserve(match_group_sites.size()); for (int match_group_site : match_group_sites) { @@ -195,7 +211,7 @@ void Matcher::process_site(const std::vector& genotype) { } int counter0 = 0; int counter1 = 0; - if (genotype.size() != num_samples) { + if (static_cast(genotype.size()) != num_samples) { throw std::runtime_error("invalid genotype vector size"); } @@ -217,14 +233,14 @@ void Matcher::process_site(const std::vector& genotype) { sorting = next_sorting; // Threading-neighbor queries - if (match_group_idx < match_group_sites.size() - 1 && - sites_processed >= match_group_sites.at(match_group_idx + 1)) { + if (match_group_idx < (static_cast(match_group_sites.size()) - 1) && + (sites_processed >= match_group_sites.at(match_group_idx + 1))) { match_group_idx++; match_groups.at(match_group_idx - 1).filter_matches(min_matches); } // If we've reached a query site, query - if (next_query_site_idx < query_sites.size() && + if (next_query_site_idx < static_cast(query_sites.size()) && sites_processed == query_sites.at(next_query_site_idx)) { // Get the arg-sort of the sorting for (int i = 0; i < num_samples; i++) { @@ -238,18 +254,18 @@ void Matcher::process_site(const std::vector& genotype) { // Insert sequences and query in order for (int i = 1; i < num_samples; i++) { std::vector matches; - int allele = genotype.at(i); matches.reserve(neighborhood_size); auto iter = threaded.insert(permutation.at(i)); auto iter_up = iter.first; auto iter_down = iter.first; // Check if genotypes are identical, just to be sure - while (matches.size() < neighborhood_size && (iter_down != threaded.begin() || iter_up != threaded.end())) { + while ((static_cast(matches.size()) < neighborhood_size) && + (iter_down != threaded.begin() || iter_up != threaded.end())) { if (iter_down != threaded.begin()) { iter_down--; matches.push_back(sorting.at(*iter_down)); } - if (matches.size() < neighborhood_size && iter_up != threaded.end()) { + if (static_cast(matches.size()) < neighborhood_size && iter_up != threaded.end()) { iter_up++; if (iter_up != threaded.end()) { matches.push_back(sorting.at(*iter_up)); @@ -273,7 +289,7 @@ void Matcher::process_site(const std::vector& genotype) { } // Special case for last query - if (next_query_site_idx == query_sites.size()) { + if (next_query_site_idx == static_cast(query_sites.size())) { match_groups.at(match_group_sites.size() - 1).filter_matches(min_matches); } } @@ -282,7 +298,7 @@ void Matcher::process_site(const std::vector& genotype) { // Propagate top 4 matches from left and right match groups void Matcher::propagate_adjacent_matches() { - for (int i = 1; i < match_groups.size(); i++) { + for (int i = 1; i < static_cast(match_groups.size()); i++) { MatchGroup& group = match_groups.at(i); MatchGroup& prev = match_groups.at(i - 1); group.insert_tops_from(prev); diff --git a/src/Matcher.hpp b/src/Matcher.hpp index ff468dd..fbe5a41 100644 --- a/src/Matcher.hpp +++ b/src/Matcher.hpp @@ -1,3 +1,19 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #ifndef THREADS_ARG_MATCHER_HPP #define THREADS_ARG_MATCHER_HPP @@ -5,45 +21,26 @@ #include #include -// for a certain interval, store the matches for all samples +/// for a certain interval, store the matches for all samples class MatchGroup { public: - int num_samples; - std::unordered_map> match_candidates; - std::vector> match_candidates_counts; - std::vector>> top_four_maps; - double cm_position; MatchGroup(int _num_samples, double cm_position); MatchGroup(const std::vector& target_ids, const std::vector>& matches, const double _cm_position); void filter_matches(int min_matches); void insert_tops_from(MatchGroup& other); void clear(); + +public: + int num_samples = 0; + std::unordered_map> match_candidates; + std::vector> match_candidates_counts; + std::vector>> top_four_maps; + double cm_position = 0.0; }; class Matcher { - -private: - int min_matches = 0; - int sites_processed = 0; - int next_query_site_idx = 0; - int match_group_idx = 0; - int min_match_length = 1; - std::vector match_groups; - std::vector sorting; - std::vector next_sorting; - std::vector permutation; - public: - int neighborhood_size = 0; - std::vector query_sites; - std::vector match_group_sites; - int num_samples = 0; - int num_sites = 0; - double query_interval_size = 0.0; - // matches in these groups are considered together in the hmm - double match_group_interval_size = 0.0; - std::vector genetic_positions; Matcher(int _n, const std::vector& _genetic_positions, double _query_interval_size, double _match_group_interval_size, int _neighborhood_size, int _min_matches); @@ -51,6 +48,7 @@ class Matcher { void process_site(const std::vector& genotype); void propagate_adjacent_matches(); void clear(); + std::vector get_matches(); std::vector>> serializable_matches(std::vector& target_ids); @@ -58,6 +56,27 @@ class Matcher { std::vector get_sorting(); std::vector get_permutation(); + +public: + int num_samples = 0; + std::vector genetic_positions; + double query_interval_size = 0.0; + double match_group_interval_size = 0.0; + int neighborhood_size = 0; + std::vector query_sites; + std::vector match_group_sites; + int num_sites = 0; + // matches in these groups are considered together in the hmm + +private: + int min_matches = 0; + int sites_processed = 0; + int next_query_site_idx = 0; + int match_group_idx = 0; + std::vector match_groups; + std::vector sorting; + std::vector next_sorting; + std::vector permutation; }; #endif // THREADS_ARG_MATCHER_HPP diff --git a/src/Node.cpp b/src/Node.cpp index 4eb6faa..12bf8a9 100644 --- a/src/Node.cpp +++ b/src/Node.cpp @@ -1,8 +1,23 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #include "Node.hpp" #include - Node::Node(int _sample_ID, int _divergence, bool _genotype) : sample_ID(_sample_ID), divergence(_divergence), genotype(_genotype) { } diff --git a/src/Node.hpp b/src/Node.hpp index 1b228ef..1c07075 100644 --- a/src/Node.hpp +++ b/src/Node.hpp @@ -1,11 +1,36 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #ifndef THREADS_ARG_NODE_HPP #define THREADS_ARG_NODE_HPP #include #include - class Node { +public: + // Constructors + Node(int sample_ID, int divergence, bool genotype); + + // Node movers-arounders + void insert_above(Node* node); + + // Output + friend std::ostream& operator<<(std::ostream& os, const Node& node); + public: // Node data int sample_ID = 0; @@ -18,15 +43,6 @@ class Node { // "Next below to the right" for 0 and 1 std::array w = {nullptr, nullptr}; - - // Constructors - Node(int sample_ID, int divergence, bool genotype); - - // Node movers-arounders - void insert_above(Node* node); - - // Output - friend std::ostream& operator<<(std::ostream& os, const Node& node); }; #endif // THREADS_ARG_NODE_HPP \ No newline at end of file diff --git a/src/State.cpp b/src/State.cpp index 6c8125d..b726adb 100644 --- a/src/State.cpp +++ b/src/State.cpp @@ -1,3 +1,19 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #include "State.hpp" #include @@ -5,7 +21,6 @@ #include #include - TracebackState::TracebackState(int _site, Node* _best_prev_node, TracebackState* _prev) : site(_site), best_prev_node(_best_prev_node), prev(_prev) { } @@ -14,7 +29,7 @@ State::State(Node* _below, double _score, TracebackState* _traceback) : below(_below), score(_score), traceback(_traceback) { } -bool State::genotype() { +bool State::genotype() const { return this->below->above->genotype; } @@ -55,7 +70,7 @@ void StateBranch::prune() { }); std::vector new_states; - double running_min_score = std::numeric_limits::infinity(); + double running_min_score = std::numeric_limits::max(); for (const State& s : states) { if (s.score < running_min_score) { new_states.push_back(s); @@ -81,7 +96,7 @@ void StateTree::prune() { } } -std::vector StateTree::dump() { +std::vector StateTree::dump() const { std::vector states; for (auto pair : branches) { for (auto s : pair.second.states) { diff --git a/src/State.hpp b/src/State.hpp index 9ad796b..fcc589f 100644 --- a/src/State.hpp +++ b/src/State.hpp @@ -1,69 +1,85 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #ifndef THREADS_ARG_STATE_HPP #define THREADS_ARG_STATE_HPP #include "Node.hpp" -#include #include +#include class TracebackState { +public: + TracebackState(int _site, Node* _best_prev_node, TracebackState* _prev); + public: int site = 0; - // ID to trace back through to last traceback state - Node* best_prev_node = nullptr; + Node* best_prev_node = nullptr; ///< ID to trace back through to last traceback state TracebackState* prev = nullptr; - - TracebackState(int _site, Node* _best_prev_node, TracebackState* _prev); }; class State { public: - // Nothing here can be const because we want to use std::sort later on - // Panel entry directly below - Node* below = nullptr; - // Score of the current state - double score = 0.0; - // Pointer to last recombinant state - TracebackState* traceback = nullptr; - // Shorthand for this.below->site - bool genotype(); - State(Node* _below, double _score, TracebackState* _traceback); + /// Shorthand for this.below->site + bool genotype() const; + friend std::ostream& operator<<(std::ostream& os, State& state); + +public: + // Nothing here can be const because we want to use std::sort later on + Node* below = nullptr; ///< Panel entry directly below + double score = 0.0; ///< Score of the current state + TracebackState* traceback = nullptr; }; class StatePair { public: - // Panel entry directly below - Node* below_a = nullptr; - Node* below_b = nullptr; - // Score of the current state - double score = 0.0; - // Pointer to last recombinant state - TracebackState* traceback_a = nullptr; - TracebackState* traceback_b = nullptr; - StatePair(Node* _below_a, Node* _below_b, double _score, TracebackState* _traceback_a, TracebackState* _traceback_b); friend std::ostream& operator<<(std::ostream& os, StatePair& state_pair); + +public: + Node* below_a = nullptr; ///< Panel entry directly below + Node* below_b = nullptr; + double score = 0.0; ///< Score of the current state + TracebackState* traceback_a = nullptr; ///< Pointer to last recombinant state + TracebackState* traceback_b = nullptr; }; class StateBranch { public: - std::vector states; void insert(const State& state); void prune(); + +public: + std::vector states; }; class StateTree { public: - std::unordered_map branches; - StateTree(std::vector& states); void prune(); - std::vector dump(); + std::vector dump() const; + +public: + std::unordered_map branches; }; #endif // THREADS_ARG_STATE_HPP diff --git a/src/TGEN.cpp b/src/TGEN.cpp index f5f6d08..fdee447 100644 --- a/src/TGEN.cpp +++ b/src/TGEN.cpp @@ -1,8 +1,23 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #include "TGEN.hpp" #include "TgenSegment.hpp" -#include #include #include @@ -10,7 +25,6 @@ #include #include - namespace boost::icl { // See // https://www.boost.org/doc/libs/1_81_0/libs/icl/doc/html/boost_icl/examples/custom_interval.html @@ -52,15 +66,12 @@ class TGENImpl { std::vector interval_sets; std::unordered_map pos_idx_map; - Eigen::VectorXi reference_genome; std::vector reference_genome_vec; std::vector> bp_starts; std::vector> target_IDs; std::vector> het_sites; std::vector positions; std::vector> genotypes; - - Eigen::MatrixXi genotype_cache; std::unordered_map cached_genotypes_map; // Constructor @@ -70,27 +81,23 @@ class TGENImpl { het_sites(std::move(_het_sites)), positions(std::move(_positions)) { positions.push_back(std::numeric_limits::max()); // position-to-site map - for (int i = 0; i < positions.size(); i++) { + for (int i = 0; i < static_cast(positions.size()); i++) { pos_idx_map[positions[i]] = i; } - // set reference genome here - reference_genome = Eigen::VectorXi::Zero(positions.size()); reference_genome_vec = std::vector(positions.size(), false); for (int h : het_sites[0]) { - reference_genome[pos_idx_map.at(h)] = 1; reference_genome_vec[pos_idx_map.at(h)] = true; } interval_sets.reserve(bp_starts.size()); // Initialize interval maps for each sample. This can get too slow - for (int i = 0; i < bp_starts.size(); i++) { + for (std::size_t i = 0; i < bp_starts.size(); i++) { SegmentSet iset; - int n_segs = bp_starts[i].size(); + int n_segs = static_cast(bp_starts[i].size()); std::vector& sample_hets = het_sites[i]; - int n_hets = sample_hets.size(); + int n_hets = static_cast(sample_hets.size()); int het_site_idx = 0; - int pos_idx = 0; std::vector seg_hets; for (int j = 0; j < n_segs; j++) { int seg_start = bp_starts[i][j]; @@ -116,84 +123,7 @@ class TGENImpl { cached_genotypes_map[0] = -1; } - // Eigen-based query -// Eigen::MatrixXi& query(const int bp_from, const int bp_to, const std::vector& samples) { -// clear_cache(); - -// Deprecated eigen-based query -// Eigen::MatrixXi& TGEN::query(const int bp_from, const int bp_to, const std::vector& samples) { -// clear_cache(); - -// // Find number of expected sites -// int start_pos = *std::lower_bound(positions.begin(), positions.end(), bp_from); -// int end_pos = *std::upper_bound(positions.begin(), positions.end(), bp_to); -// int idx_offset = pos_idx_map[start_pos]; -// genotype_cache.resize(samples.size(), pos_idx_map[end_pos] - idx_offset); - -// TgenSegment range(start_pos, end_pos); - -// for (int i = 0; i < samples.size(); i++) { -// cached_genotypes_map[samples[i]] = i; -// if (samples[i] == 0) { -// auto insert_range = -// Eigen::seq(0, pos_idx_map[end_pos] - idx_offset - 1); // eigen seq is inclusive -// auto copy_range = Eigen::seq(idx_offset, pos_idx_map[end_pos] - 1); -// genotype_cache(i, insert_range) = reference_genome( -// copy_range); //.eval(); //WARNING need .eval() here (or do we? I don't think we do) -// } -// else { -// SegmentSet& segments(interval_sets[samples[i]]); - -// // Initialize the queue -// std::queue seg_queue; -// auto eqr = segments.equal_range(range); -// for (SegmentSet::const_iterator iter = eqr.first; iter != eqr.second; iter++) { -// seg_queue.push(*iter & range); -// } - -// // Process everything in the queue -// while (!seg_queue.empty()) { -// TgenSegment& segment = seg_queue.front(); -// if (cached_genotypes_map.find(segment.target) != cached_genotypes_map.end()) { -// // We've reached somewhere along the tree where we can copy from -// int seg_start_idx = pos_idx_map[segment.lower()]; -// int seg_end_idx = pos_idx_map[segment.upper()]; -// auto insert_range = Eigen::seq( -// seg_start_idx - idx_offset, seg_end_idx - idx_offset - 1); // eigen seq is inclusive -// if (segment.target == 0) { -// auto copy_range = Eigen::seq(seg_start_idx, seg_end_idx - 1); -// // We've reached the root of the tree and copy from the "reference" genome -// genotype_cache(i, insert_range) = reference_genome(copy_range); -// } -// else { -// // We've found a cached genotype to copy from - -// genotype_cache(i, insert_range) = -// genotype_cache(cached_genotypes_map[segment.target], insert_range); -// } - -// // We then flip all the het sites -// for (int h : segment.het_sites) { -// genotype_cache(i, pos_idx_map[h] - idx_offset) = -// 1 - genotype_cache(i, pos_idx_map[h] - idx_offset); -// } -// } -// else { -// // We've not yet reached somewhere to copy from, so we keep traversing -// auto new_eqr = interval_sets[segment.target].equal_range(segment); -// for (SegmentSet::const_iterator iter = new_eqr.first; iter != new_eqr.second; iter++) { -// seg_queue.push(*iter & segment); -// } -// } -// seg_queue.pop(); -// } -// } -// } -// return genotype_cache; -// } - -// std::vector-based query -// Warning: This makes a copy when returned through the python interface + // Warning: This makes a copy when returned through the python interface std::vector>& query(const int bp_from, const int bp_to, const std::vector& samples) { genotypes.clear(); @@ -203,15 +133,14 @@ class TGENImpl { int end_pos = *std::upper_bound(positions.begin(), positions.end(), bp_to); int idx_offset = pos_idx_map[start_pos]; - int n_samples = samples.size(); int n_sites = pos_idx_map[end_pos] - idx_offset; genotypes.reserve(samples.size()); - for (int i = 0; i < samples.size(); i++) { + for (std::size_t i = 0; i < samples.size(); i++) { genotypes.push_back(std::vector(n_sites)); } TgenSegment range(start_pos, end_pos); - for (int i = 0; i < samples.size(); i++) { + for (int i = 0; i < static_cast(samples.size()); i++) { std::vector& current_gt = genotypes.at(i); cached_genotypes_map[samples[i]] = i; if (samples[i] == 0) { @@ -225,7 +154,7 @@ class TGENImpl { std::queue seg_queue; auto eqr = segments.equal_range(range); for (SegmentSet::const_iterator iter = eqr.first; iter != eqr.second; iter++) { - seg_queue.push(*iter & range); + seg_queue.push(iter->calc_intersection_with(range)); } // Process everything in the queue @@ -259,7 +188,7 @@ class TGENImpl { // We've not yet reached somewhere to copy from, so we keep traversing auto new_eqr = interval_sets[segment.target].equal_range(segment); for (SegmentSet::const_iterator iter = new_eqr.first; iter != new_eqr.second; iter++) { - seg_queue.push(*iter & segment); + seg_queue.push(iter->calc_intersection_with(segment)); } } seg_queue.pop(); @@ -271,7 +200,6 @@ class TGENImpl { void clear_cache() { cached_genotypes_map.clear(); - genotype_cache.resize(0, 0); cached_genotypes_map[0] = -1; } }; @@ -279,11 +207,15 @@ class TGENImpl { TGEN::TGEN(std::vector _positions, std::vector> _bp_starts, std::vector> _target_IDs, std::vector> _het_sites) : pimpl(std::make_unique(std::move(_positions), std::move(_bp_starts), - std::move(_target_IDs), std::move(_het_sites))) {} + std::move(_target_IDs), std::move(_het_sites))) { +} -TGEN::~TGEN() = default; +TGEN::~TGEN() { + // Empty destructor is declared in source rather than header so pimpl destructor is accessible +} -std::vector>& TGEN::query(int start_pos, int end_pos, const std::vector& samples) { +std::vector>& TGEN::query(int start_pos, int end_pos, + const std::vector& samples) { // Forward the call to pimpl return pimpl->query(start_pos, end_pos, samples); } diff --git a/src/TGEN.hpp b/src/TGEN.hpp index 1a886d1..79b9c2a 100644 --- a/src/TGEN.hpp +++ b/src/TGEN.hpp @@ -1,32 +1,42 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #ifndef THREADS_ARG_TGEN_HPP #define THREADS_ARG_TGEN_HPP -#include #include #include #include - -class TGENImpl; // Forward declaration of the implementation class +// Forward declaration of the implementation class // Using PIMPL idiom to prevent libraries that #include "TGEN.hpp" needing to see boost/icl headers +class TGENImpl; class TGEN { - -private: - std::unique_ptr pimpl; // Pointer to implementation - public: - // Constructors TGEN(std::vector _positions, std::vector> _bp_starts, std::vector> _target_IDs, std::vector> _het_sites); + ~TGEN(); - ~TGEN(); // Define the destructor for proper deletion of pimpl - - std::vector>& query(int start_pos, int end_pos, const std::vector& samples); - - // Deprecated Eigen version: - // Eigen::MatrixXi& query(const int start_pos, const int end_pos, const std::vector& samples); + std::vector>& query(int start_pos, int end_pos, + const std::vector& samples); void clear_cache(); + +private: + std::unique_ptr pimpl; ///< Pointer to implementation }; #endif // THREADS_ARG_TGEN_HPP diff --git a/src/TgenSegment.hpp b/src/TgenSegment.hpp index 8c18d20..1a89c84 100644 --- a/src/TgenSegment.hpp +++ b/src/TgenSegment.hpp @@ -1,3 +1,19 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #ifndef THREADS_ARG_TGEN_SEGMENT_HPP #define THREADS_ARG_TGEN_SEGMENT_HPP @@ -9,25 +25,22 @@ // Here is a typical class that may model intervals in your application. class TgenSegment { public: - TgenSegment() : _first(), _past(), target(std::numeric_limits::max()) { - } - TgenSegment(const TgenSegment& other) - : _first(other._first), _past(other._past), het_sites(other.het_sites), target(other.target) { + TgenSegment() : target(std::numeric_limits::max()), first(), past() { } - TgenSegment(int lo, int up) : _first(lo), _past(up), target(std::numeric_limits::max()) { + + TgenSegment(int lo, int up) : target(std::numeric_limits::max()), first(lo), past(up) { } + TgenSegment(int lo, int up, std::vector _hets, int _target) - : _first(lo), _past(up), het_sites(_hets), target(_target) { + : het_sites(_hets), target(_target), first(lo), past(up) { } - std::vector het_sites; - int target = std::numeric_limits::max(); - [[nodiscard]] int lower() const { - return _first; + return first; } + [[nodiscard]] int upper() const { - return _past; + return past; } friend std::ostream& operator<<(std::ostream& os, const TgenSegment& seg) { @@ -40,7 +53,7 @@ class TgenSegment { return os; } - TgenSegment operator&(const TgenSegment& other) const { + TgenSegment calc_intersection_with(const TgenSegment& other) const { int new_first = std::max(lower(), other.lower()); int new_past = std::min(upper(), other.upper()); @@ -53,9 +66,13 @@ class TgenSegment { return {new_first, new_past, merged_sites, std::min(target, other.target)}; } +public: + std::vector het_sites; + int target = std::numeric_limits::max(); + private: - int _first = 0; - int _past = 0; + int first = 0; + int past = 0; }; #endif // THREADS_ARG_TGEN_SEGMENT_HPP diff --git a/src/ThreadsFastLS.cpp b/src/ThreadsFastLS.cpp index c9a1016..1f81f5e 100644 --- a/src/ThreadsFastLS.cpp +++ b/src/ThreadsFastLS.cpp @@ -1,4 +1,21 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #include "ThreadsFastLS.hpp" + #include #include #include @@ -13,18 +30,31 @@ #include #include +// Uncomment this #define to enable a runtime check that genetic position are in order. This +// diagnostic check is left in whilst we may have issues during development. +// #define THREADS_FAST_LS_CHECK_IN_ORDER + +namespace { + +const int END_ALLELE = 0; +const int HMM_SPLIT_THRESHOLD = 1000; -int END_ALLELE = 0; +inline std::size_t pair_key(int i, int j) { + return (static_cast(i) << 32) | static_cast(j); +} -ThreadsFastLS::ThreadsFastLS(std::vector _physical_positions, std::vector _genetic_positions, - double _mutation_rate, std::vector ne, std::vector ne_times, - bool _sparse_sites, - int _n_prune, // Threshold for pruning states in the tdpbwt algorithm - bool _use_hmm, int _burn_in_left, int _burn_in_right) - : physical_positions(_physical_positions), genetic_positions(_genetic_positions), - mutation_rate(_mutation_rate), demography(Demography(ne, ne_times)), - sparse_sites(_sparse_sites), n_prune(_n_prune), use_hmm(_use_hmm), - burn_in_left(_burn_in_left), burn_in_right(_burn_in_right) { +} // namespace + +ThreadsFastLS::ThreadsFastLS(std::vector _physical_positions, + std::vector _genetic_positions, double _mutation_rate, + std::vector ne, std::vector ne_times, + bool _sparse_sites, + int _n_prune, // Threshold for pruning states in the tdpbwt algorithm + bool _use_hmm, int _burn_in_left, int _burn_in_right) + : n_prune(_n_prune), mutation_rate(_mutation_rate), burn_in_left(_burn_in_left), + burn_in_right(_burn_in_right), sparse_sites(_sparse_sites), use_hmm(_use_hmm), + physical_positions(_physical_positions), genetic_positions(_genetic_positions), + demography(Demography(ne, ne_times)) { if (physical_positions.size() != genetic_positions.size()) { std::cerr << "Map lengths don't match.\n"; exit(1); @@ -37,26 +67,26 @@ ThreadsFastLS::ThreadsFastLS(std::vector _physical_positions, std::vecto std::cerr << "Need a strictly positive mutation rate.\n"; exit(1); } - num_sites = physical_positions.size(); + num_sites = static_cast(physical_positions.size()); num_samples = 0; - // Check maps are strictly increasing - // for (int i = 0; i < num_sites - 1; i++) { - // if (physical_positions[i + 1] <= physical_positions[i]) { - // cerr << "Physical positions must be strictly increasing, found "; - // cerr << physical_positions[i + 1] << " after " << physical_positions[i] << endl; - // exit(1); - // } - // if (genetic_positions[i + 1] <= genetic_positions[i]) { - // cerr << "Genetic coordinates must be strictly increasing, found "; - // cerr << genetic_positions[i + 1] << " after " << genetic_positions[i] << endl; - // exit(1); - // } - // } +#ifdef THREADS_FAST_LS_CHECK_IN_ORDER + for (int i = 0; i < num_sites - 1; i++) { + if (physical_positions[i + 1] <= physical_positions[i]) { + cerr << "Physical positions must be strictly increasing, found "; + cerr << physical_positions[i + 1] << " after " << physical_positions[i] << endl; + exit(1); + } + if (genetic_positions[i + 1] <= genetic_positions[i]) { + cerr << "Genetic coordinates must be strictly increasing, found "; + cerr << genetic_positions[i + 1] << " after " << genetic_positions[i] << endl; + exit(1); + } + } +#endif // THREADS_FAST_LS_CHECK_IN_ORDER // Initialize map burn-in threading_start = physical_positions.front() + burn_in_left; - // threading_end = physical_positions.back() - burn_in_right + 1; threading_end = physical_positions.back() - burn_in_right; trim_pos_start_idx = 0; for (int i = 0; i < num_sites; i++) { @@ -78,7 +108,8 @@ ThreadsFastLS::ThreadsFastLS(std::vector _physical_positions, std::vecto } } if (trim_pos_start_idx >= trim_pos_end_idx - 3) { - std::cerr << "Too few positions left after applying burn-in, need at least 3. Aborting." << std::endl; + std::cerr << "Too few positions left after applying burn-in, need at least 3. Aborting." + << std::endl; exit(1); } @@ -114,15 +145,15 @@ ThreadsFastLS::ThreadsFastLS(std::vector _physical_positions, std::vecto std::tuple, std::vector> ThreadsFastLS::site_sizes(std::vector positions) { // Find mid-points between sites - int M = positions.size(); + std::size_t M = positions.size(); std::vector pos_means(M - 1); - for (int i = 0; i < M - 1; i++) { + for (std::size_t i = 0; i < M - 1; i++) { pos_means[i] = (positions[i] + positions[i + 1]) / 2.; } // Find the mean size of mid-point differences std::vector site_sizes(M); // Mid-point deltas tell us about the area around each site - for (int i = 1; i < M - 1; i++) { + for (std::size_t i = 1; i < M - 1; i++) { site_sizes[i] = (pos_means[i] - pos_means[i - 1]); } double mean_size = (pos_means[M - 2] - pos_means[0]) / double(M - 2); @@ -137,16 +168,13 @@ ThreadsFastLS::site_sizes(std::vector positions) { std::vector boundaries(M + 1); boundaries[0] = positions[0]; boundaries[M] = positions[M - 1]; - for (int i = 1; i < M; i++) { + for (std::size_t i = 1; i < M; i++) { boundaries[i] = pos_means[i - 1]; } return std::tuple(boundaries, site_sizes); } -/** - * - */ -std::vector ThreadsFastLS::trimmed_positions() { +std::vector ThreadsFastLS::trimmed_positions() const { std::vector trim_pos = {physical_positions.cbegin() + trim_pos_start_idx, physical_positions.cbegin() + trim_pos_end_idx}; return trim_pos; @@ -159,14 +187,6 @@ void ThreadsFastLS::delete_hmm() { } } -/** - * @brief Find the next insert position - * - * @param t Pointer to node below the sequence being inserted at site i - * @param g Allele at site i+1 - * @param i The site - * @return Node* Pointer to node below the sequence being inserted at site i+1 - */ Node* ThreadsFastLS::extend_node(Node* t, bool g, int i) { Node* t_next; if (!g && t->w[g]->sample_ID == -1) { @@ -183,18 +203,13 @@ Node* ThreadsFastLS::extend_node(Node* t, bool g, int i) { void ThreadsFastLS::insert(const std::vector& genotype) { insert(num_samples, genotype); } -/** - * @brief Insert a new sequence into the dynamic panel - * - * @param genotype - */ void ThreadsFastLS::insert(const int ID, const std::vector& genotype) { if (ID_map.find(ID) != ID_map.end()) { std::cerr << "ID " << ID << " is already in the panel.\n"; exit(1); } - if (genotype.size() != num_sites) { + if (static_cast(genotype.size()) != num_sites) { std::cerr << "Number of input markers does not match map.\n"; exit(1); } @@ -207,7 +222,7 @@ void ThreadsFastLS::insert(const int ID, const std::vector& genotype) { panel.emplace_back(std::vector>(num_sites + 1)); Node* t0 = bottoms[0].get(); - panel[ID_map.at(ID)][0] = std::move(std::make_unique(ID, 0, genotype[0])); + panel[ID_map.at(ID)][0] = std::make_unique(ID, 0, genotype[0]); Node* z0 = panel[ID_map.at(ID)][0].get(); // Inserts z0 above t0 @@ -223,7 +238,7 @@ void ThreadsFastLS::insert(const int ID, const std::vector& genotype) { bool g_k = genotype[k]; bool next_genotype = (k == num_sites - 1) ? END_ALLELE : genotype[k + 1]; // Add current thingy to panel - panel[ID_map.at(ID)][k + 1] = std::move(std::make_unique(ID, k + 1, next_genotype)); + panel[ID_map.at(ID)][k + 1] = std::make_unique(ID, k + 1, next_genotype); z_next = panel[ID_map.at(ID)][k + 1].get(); tmp = z_k->above; while (tmp->sample_ID != -1 && tmp->genotype != g_k) { @@ -272,12 +287,6 @@ void ThreadsFastLS::insert(const int ID, const std::vector& genotype) { } } -/** - * @brief Deletes sequence ID from the dynamic panel. This moves the last sequence in the panel - * to the position ID held. See alg 5 from d-PBWT paper. - * - * @param ID - */ void ThreadsFastLS::remove(int ID) { Node* s = panel[ID_map.at(ID)][0].get(); // The last sequence in the panel @@ -322,11 +331,7 @@ void ThreadsFastLS::remove(int ID) { num_samples--; } -/** - * @brief For debugging: print the sample-IDs of the arrayified panel. - * - */ -void ThreadsFastLS::print_sorting() { +void ThreadsFastLS::print_sorting() const { for (int j = 0; j < num_sites + 1; ++j) { Node* node = tops[j].get(); while (node != nullptr) { @@ -339,13 +344,8 @@ void ThreadsFastLS::print_sorting() { } } -/** - * Run Li-Stephens on input haplotype *without* inserting into the dynamic panel. - * See also Algorithm 4 of Lunter (2018), Bioinformatics. - * For imputation we use the IMPUTE/Beagle mutation penalties - */ std::pair ThreadsFastLS::fastLS(const std::vector& genotype, - bool imputation) { + bool imputation) { // Get mutation/recombination penalties; std::vector mu; std::vector mu_c; @@ -373,12 +373,11 @@ std::pair ThreadsFastLS::fastLS(const std::vector& // z holds the best current score double z; int max_states = 0; - bool allele; State best_extension = current_states.back(); for (int i = 0; i < num_sites; i++) { bool allele = genotype[i]; - int n_states = current_states.size(); + int n_states = static_cast(current_states.size()); max_states = std::max(n_states, max_states); if (n_states == 0) { std::cerr << "No states left on stack, something is messed up in the algorithm.\n"; @@ -449,8 +448,7 @@ std::pair ThreadsFastLS::fastLS(const std::vector& } // Pruning is turned off by default - if (n_prune >= 0 && i % 100 == 0 && new_states.size() >= n_prune) { - int old_size = new_states.size(); + if ((n_prune >= 0) && (i % 100 == 0) && (static_cast(new_states.size()) >= n_prune)) { StateTree tree = StateTree(new_states); tree.prune(); current_states = tree.dump(); @@ -467,18 +465,14 @@ std::pair ThreadsFastLS::fastLS(const std::vector& [](const auto& s1, const auto& s2) { return s1.score < s2.score; })); if ((num_samples + 1) % 100 == 0) { - std::cout << "Found best path with score " << min_state.score << " for sequence " << num_samples + 1; + std::cout << "Found best path with score " << min_state.score << " for sequence " + << num_samples + 1; std::cout << ", using a maximum of " << max_states << " states.\n"; } return std::pair(min_state.traceback, min_state.below->above); } -/** - * Run Li-Stephens on input diplotype *without* inserting into the dynamic panel. - * See also Algorithm 4 of Lunter (2018), Bioinformatics. - * Warning: The code here is more verbose than it has to be - **/ std::array, 2> ThreadsFastLS::fastLS_diploid(const std::vector& genotype) { // Get mutation/recombination penalties; @@ -506,7 +500,6 @@ ThreadsFastLS::fastLS_diploid(const std::vector& genotype) { // z holds the best current score for pairs and individual sequences double z; int max_state_pairs = 0; - bool allele; StatePair best_pair = current_pairs.back(); bool extensible_a0; bool extensible_a1; @@ -516,7 +509,7 @@ ThreadsFastLS::fastLS_diploid(const std::vector& genotype) { // Just like haploid, we iterate through sites for (int i = 0; i < num_sites; i++) { int allele = genotype[i]; - int n_state_pairs = current_pairs.size(); + int n_state_pairs = static_cast(current_pairs.size()); std::vector new_pairs; max_state_pairs = std::max(n_state_pairs, max_state_pairs); if (n_state_pairs == 0) { @@ -571,14 +564,14 @@ ThreadsFastLS::fastLS_diploid(const std::vector& genotype) { } // Set local minima, this maps (anchor, traceback) to a score - std::unordered_map local_min; - std::unordered_map extmap_0; - std::unordered_map extmap_1; + std::unordered_map local_min; + std::unordered_map extmap_0; + std::unordered_map extmap_1; for (StatePair& p : current_pairs) { - size_t key_a = pair_key(p.below_a->above->sample_ID, p.traceback_a->site); - size_t key_b = pair_key(p.below_b->above->sample_ID, p.traceback_b->site); - double z_pair; + std::size_t key_a = pair_key(p.below_a->above->sample_ID, p.traceback_a->site); + std::size_t key_b = pair_key(p.below_b->above->sample_ID, p.traceback_b->site); + double z_pair = std::numeric_limits::max(); // Set/get extensibility if (extmap_0.count(key_a)) { @@ -649,6 +642,10 @@ ThreadsFastLS::fastLS_diploid(const std::vector& genotype) { z_pair = p.score + 2 * mutation_cost; } } + else { + throw std::runtime_error("Illegal genotype value " + std::to_string(allele)); + } + if (!local_min.count(key_a)) { // Set local minima local_min[key_a] = z_pair; @@ -665,16 +662,14 @@ ThreadsFastLS::fastLS_diploid(const std::vector& genotype) { } // START OF MAIN EXTENSION LOOP - std::unordered_map new_local_min; + std::unordered_map new_local_min; for (StatePair& p : current_pairs) { bool extended = false; - size_t key_a = pair_key(p.below_a->above->sample_ID, p.traceback_a->site); - size_t key_b = pair_key(p.below_b->above->sample_ID, p.traceback_b->site); + std::size_t key_a = pair_key(p.below_a->above->sample_ID, p.traceback_a->site); + std::size_t key_b = pair_key(p.below_b->above->sample_ID, p.traceback_b->site); double z_a = local_min.at(key_a); double z_b = local_min.at(key_b); - Node* a_next; - Node* b_next; extensible_a0 = extmap_0.at(key_a); extensible_a1 = extmap_1.at(key_a); extensible_b0 = extmap_0.at(key_b); @@ -687,8 +682,8 @@ ThreadsFastLS::fastLS_diploid(const std::vector& genotype) { if (case1_cost <= std::min({z_a + rho_delta, z_b + rho_delta, z + 2 * rho_delta})) { if (allele == 0) { if (extensible_a0 && extensible_b0) { - a_next = extend_node(p.below_a, 0, i); - b_next = extend_node(p.below_b, 0, i); + Node* a_next = extend_node(p.below_a, 0, i); + Node* b_next = extend_node(p.below_b, 0, i); new_pairs.emplace_back(a_next, b_next, case1_cost, p.traceback_a, p.traceback_b); added_1a.push_back(a_next); added_1b.push_back(b_next); @@ -697,16 +692,16 @@ ThreadsFastLS::fastLS_diploid(const std::vector& genotype) { } else if (allele == 1) { if (extensible_a0 && extensible_b1) { - a_next = extend_node(p.below_a, 0, i); - b_next = extend_node(p.below_b, 1, i); + Node* a_next = extend_node(p.below_a, 0, i); + Node* b_next = extend_node(p.below_b, 1, i); new_pairs.emplace_back(a_next, b_next, case1_cost, p.traceback_a, p.traceback_b); added_1a.push_back(a_next); added_1b.push_back(b_next); extended = true; } if (extensible_a1 && extensible_b0) { - a_next = extend_node(p.below_a, 1, i); - b_next = extend_node(p.below_b, 0, i); + Node* a_next = extend_node(p.below_a, 1, i); + Node* b_next = extend_node(p.below_b, 0, i); new_pairs.emplace_back(a_next, b_next, case1_cost, p.traceback_a, p.traceback_b); added_1a.push_back(a_next); added_1b.push_back(b_next); @@ -715,8 +710,8 @@ ThreadsFastLS::fastLS_diploid(const std::vector& genotype) { } else { if (extensible_a1 && extensible_b1) { - a_next = extend_node(p.below_a, 1, i); - b_next = extend_node(p.below_b, 1, i); + Node* a_next = extend_node(p.below_a, 1, i); + Node* b_next = extend_node(p.below_b, 1, i); new_pairs.emplace_back(a_next, b_next, case1_cost, p.traceback_a, p.traceback_b); added_1a.push_back(a_next); added_1b.push_back(b_next); @@ -729,7 +724,7 @@ ThreadsFastLS::fastLS_diploid(const std::vector& genotype) { local_min[key_a] = std::min(z_a, case1_cost); local_min[key_b] = std::min(z_b, case1_cost); for (auto a1 : added_1a) { - size_t new_key_a = pair_key(a1->above->sample_ID, p.traceback_a->site); + std::size_t new_key_a = pair_key(a1->above->sample_ID, p.traceback_a->site); if (new_local_min.count(new_key_a)) { new_local_min[new_key_a] = std::min(new_local_min.at(new_key_a), case1_cost); } @@ -738,7 +733,7 @@ ThreadsFastLS::fastLS_diploid(const std::vector& genotype) { } } for (auto b1 : added_1b) { - size_t new_key_b = pair_key(b1->above->sample_ID, p.traceback_b->site); + std::size_t new_key_b = pair_key(b1->above->sample_ID, p.traceback_b->site); if (new_local_min.count(new_key_b)) { new_local_min[new_key_b] = std::min(new_local_min.at(new_key_b), case1_cost); } @@ -797,7 +792,7 @@ ThreadsFastLS::fastLS_diploid(const std::vector& genotype) { local_min[key_a] = std::min(local_min.at(key_a), case2_cost); local_min[key_b] = std::min(local_min.at(key_b), case2_cost); for (auto a2 : added_2a) { - size_t new_key_a = pair_key(a2->above->sample_ID, p.traceback_a->site); + std::size_t new_key_a = pair_key(a2->above->sample_ID, p.traceback_a->site); if (new_local_min.count(new_key_a)) { new_local_min[new_key_a] = std::min(new_local_min.at(new_key_a), case2_cost); } @@ -806,7 +801,7 @@ ThreadsFastLS::fastLS_diploid(const std::vector& genotype) { } } for (auto b2 : added_2b) { - size_t new_key_b = pair_key(b2->above->sample_ID, p.traceback_b->site); + std::size_t new_key_b = pair_key(b2->above->sample_ID, p.traceback_b->site); if (new_local_min.count(new_key_b)) { new_local_min[new_key_b] = std::min(new_local_min.at(new_key_b), case2_cost); } @@ -867,7 +862,7 @@ ThreadsFastLS::fastLS_diploid(const std::vector& genotype) { local_min[key_a] = std::min(local_min.at(key_a), case3_cost); local_min[key_b] = std::min(local_min.at(key_b), case3_cost); for (auto a3 : added_3a) { - size_t new_key_a = pair_key(a3->above->sample_ID, p.traceback_a->site); + std::size_t new_key_a = pair_key(a3->above->sample_ID, p.traceback_a->site); if (new_local_min.count(new_key_a)) { new_local_min[new_key_a] = std::min(new_local_min.at(new_key_a), case3_cost); } @@ -876,7 +871,7 @@ ThreadsFastLS::fastLS_diploid(const std::vector& genotype) { } } for (auto b3 : added_3b) { - size_t new_key_b = pair_key(b3->above->sample_ID, p.traceback_b->site); + std::size_t new_key_b = pair_key(b3->above->sample_ID, p.traceback_b->site); if (new_local_min.count(new_key_b)) { new_local_min[new_key_b] = std::min(new_local_min.at(new_key_b), case3_cost); } @@ -896,12 +891,12 @@ ThreadsFastLS::fastLS_diploid(const std::vector& genotype) { } // SINGLE RECOMBINATION EVENTS - std::unordered_set already_recombined; + std::unordered_set already_recombined; std::vector rec_pairs; for (StatePair& p : new_pairs) { - size_t key_a = pair_key(p.below_a->above->sample_ID, p.traceback_a->site); - size_t key_b = pair_key(p.below_b->above->sample_ID, p.traceback_b->site); + std::size_t key_a = pair_key(p.below_a->above->sample_ID, p.traceback_a->site); + std::size_t key_b = pair_key(p.below_b->above->sample_ID, p.traceback_b->site); double recombinant_score = p.score - rho_c[i] + rho[i]; if (!already_recombined.count(key_a) && std::abs(new_local_min.at(key_a) - p.score) < 0.0001) { @@ -967,15 +962,10 @@ ThreadsFastLS::fastLS_diploid(const std::vector& genotype) { std::pair(min_pair.traceback_b, min_pair.below_b->above)}; } -/** - * This is a basic traceback that samples a random haplotype per from all matches per segment and - * also stores the lowest-numbered match (for compression) - */ -std::vector>> ThreadsFastLS::traceback(TracebackState* tb, Node* match, - bool return_all) { +std::vector>> +ThreadsFastLS::traceback(TracebackState* tb, Node* match, bool return_all) { std::vector>> best_path; while (tb != nullptr) { - int n_matches = 1; int segment_start = tb->site; int match_id = match->sample_ID; std::vector div_states = {match_id}; @@ -990,7 +980,6 @@ std::vector>> ThreadsFastLS::traceback(Tracebac break; } div_states.push_back(div_node->sample_ID); - n_matches += 1; if (div_node->sample_ID < min_id) { min_id = div_node->sample_ID; } @@ -1000,7 +989,7 @@ std::vector>> ThreadsFastLS::traceback(Tracebac sampled_states = std::vector(div_states.begin(), div_states.end()); } else { - std::uniform_int_distribution<> distrib(0, div_states.size() - 1); + std::uniform_int_distribution<> distrib(0, static_cast(div_states.size()) - 1); sampled_states.reserve(2); std::sample(div_states.begin(), div_states.end(), std::back_inserter(sampled_states), 1, rng); // Add the min-state as well @@ -1016,17 +1005,12 @@ std::vector>> ThreadsFastLS::traceback(Tracebac return best_path; } -/** - * Similar to normal traceback, but picks the up-to neighborhood_size best matches and stores their overlap with the - * input sequence returns a list of a tuple of lists :-P I.e., a list of segments, and each segment - * is a tuple (sample_IDs, starts, ends) all of equal length <= neighborhood_size - */ std::vector, std::vector, std::vector>> -ThreadsFastLS::traceback_impute(std::vector& genotypes, TracebackState* tb, Node* match, int neighborhood_size) { +ThreadsFastLS::traceback_impute(std::vector& genotypes, TracebackState* tb, Node* match, + int neighborhood_size) { std::vector, std::vector, std::vector>> imputation_path; int prev_end = num_sites; while (tb != nullptr) { - int n_matches = 1; int segment_start = tb->site; int segment_end = prev_end; prev_end = segment_start; @@ -1045,7 +1029,6 @@ ThreadsFastLS::traceback_impute(std::vector& genotypes, TracebackState* tb break; } div_states.push_back(div_node->sample_ID); - n_matches += 1; if (div_node->sample_ID < min_id) { min_id = div_node->sample_ID; } @@ -1054,7 +1037,6 @@ ThreadsFastLS::traceback_impute(std::vector& genotypes, TracebackState* tb div_node = match->below; while (div_node != nullptr && div_node->divergence <= segment_start) { div_states.push_back(div_node->sample_ID); - n_matches += 1; if (div_node->sample_ID < min_id) { min_id = div_node->sample_ID; } @@ -1070,21 +1052,22 @@ ThreadsFastLS::traceback_impute(std::vector& genotypes, TracebackState* tb } // initialize original index locations - std::vector idx(overlaps.size()); + std::vector idx(overlaps.size()); std::iota(idx.begin(), idx.end(), 0); // sort indexes based on comparing values in v // using std::stable_sort instead of std::sort // to avoid unnecessary index re-orderings // when v contains elements of equal values - std::stable_sort(idx.begin(), idx.end(), [&overlaps](size_t i1, size_t i2) { + std::stable_sort(idx.begin(), idx.end(), [&overlaps](std::size_t i1, std::size_t i2) { return overlaps[i1].second - overlaps[i1].first < overlaps[i2].second - overlaps[i2].first; }); std::vector segment_starts; std::vector sample_ids; std::vector segment_ends; - for (int j = idx.size() - 1; j >= std::max(0, (int) (idx.size() - neighborhood_size)); j--) { + for (int j = static_cast(idx.size()) - 1; + j >= std::max(0, static_cast(idx.size()) - neighborhood_size); j--) { segment_starts.push_back(segment_start); segment_ends.push_back(segment_end); sample_ids.push_back(div_states[idx[j]]); @@ -1098,16 +1081,6 @@ ThreadsFastLS::traceback_impute(std::vector& genotypes, TracebackState* tb return imputation_path; } -/** - * @brief Determine whether state can be extended through panel by appending g. - * May alter - * @param s State at site i - * @param t_next Node at site i+1 - * @param g Candidate genotype for s at i+1 - * @param i The site index - * @return true - * @return false - */ bool ThreadsFastLS::extensible_by(State& s, const Node* t_next, const bool g, const int i) { int next_above_candidate = t_next->above->sample_ID; @@ -1141,7 +1114,7 @@ std::tuple, std::vector> ThreadsFastLS::mutation_pen std::vector mu_c(num_sites); double mean_bp_size = - (double) (physical_positions.back() - physical_positions[0]) / (double) num_sites; + (physical_positions.back() - physical_positions[0]) / static_cast(num_sites); // The expected branch length const double t = demography.expected_branch_length(num_samples + 1); @@ -1155,11 +1128,6 @@ std::tuple, std::vector> ThreadsFastLS::mutation_pen return std::tuple(mu, mu_c); } -/** - * @brief This gives the IMPUTE5 (and Beagle) recombination penalties - * - * @return tuple of penalty vectors - */ std::tuple, std::vector> ThreadsFastLS::mutation_penalties_impute5() { std::vector mu(num_sites); std::vector mu_c(num_sites); @@ -1173,11 +1141,6 @@ std::tuple, std::vector> ThreadsFastLS::mutation_pen return std::tuple(mu, mu_c); } -/** - * @brief This gives the *sparse* recombination penalties - * - * @return tuple of penalty vectors - */ std::tuple, std::vector> ThreadsFastLS::recombination_penalties() { // Recall: 1cM means the expected average number of intervening // chromosomal crossovers in a single generation is 0.01 @@ -1199,12 +1162,8 @@ std::tuple, std::vector> ThreadsFastLS::recombinatio return std::tuple(rho, rho_c); } -/** - * @brief This gives the correct, dense, recombination penalties - * - * @return tuple of penalty vectors - */ -std::tuple, std::vector> ThreadsFastLS::recombination_penalties_correct() { +std::tuple, std::vector> +ThreadsFastLS::recombination_penalties_correct() { // Recall: 1cM means the expected average number of intervening // chromosomal crossovers in a single generation is 0.01 std::vector rho(num_sites); @@ -1225,51 +1184,37 @@ std::tuple, std::vector> ThreadsFastLS::recombinatio return std::tuple(rho, rho_c); } -/** - * @brief Date the segment based on length and n_mismatches using maximum likelihood. (No - * demography) - * - * @param id1 - * @param id2 - * @param start inclusive - * @param end exclusive - * @return double - */ double ThreadsFastLS::date_segment(const int num_het_sites, const int start, const int end) { if (start > end) { std::cerr << "Can't date a segment with length <= 0\n"; exit(1); } - double m = (double) num_het_sites; double bp_size = 0; double cm_size = 0; for (int i = start; i < end; i++) { bp_size += bp_sizes[i]; cm_size += cm_sizes[i]; } - double mu = 2. * mutation_rate * bp_size; - double rho = 2. * 0.01 * cm_size; if (sparse_sites) { - return ThreadsFastLS::date_segment_sparse(num_het_sites, cm_size, demography); + return ThreadsFastLS::date_segment_sparse(cm_size, demography); } else { - return ThreadsFastLS::date_segment(m, cm_size, bp_size, mutation_rate, demography); + return ThreadsFastLS::date_segment(num_het_sites, cm_size, bp_size, mutation_rate, demography); } } double ThreadsFastLS::date_segment(int num_het_sites, double cm_size, double bp_size, - double mutation_rate, Demography& demography) { + double mutation_rate, Demography& demography) { int m = num_het_sites; double mu = 2. * mutation_rate * bp_size; double rho = 2. * 0.01 * cm_size; if (m > 15) { - // cout << "Warning: very many heterozygous sites, defaulting to const-demography method.\n"; double gamma = 1. / demography.expected_time; return (m + 2) / (gamma + rho + mu); } double numerator = 0; double denominator = 0; - int K = demography.times.size(); + int K = static_cast(demography.times.size()); for (int k = 0; k < K; k++) { double T1 = demography.times[k]; double gamma_k = 1. / demography.sizes[k]; @@ -1296,12 +1241,12 @@ double ThreadsFastLS::date_segment(int num_het_sites, double cm_size, double bp_ return numerator / denominator; } -double ThreadsFastLS::date_segment_sparse(int num_het_sites, double cm_size, Demography& demography) { - int m = num_het_sites; +double ThreadsFastLS::date_segment_sparse(double cm_size, Demography& demography) { + double rho = 2. * 0.01 * cm_size; double numerator = 0; double denominator = 0; - int K = demography.times.size(); + int K = static_cast(demography.times.size()); for (int k = 0; k < K; k++) { double T1 = demography.times[k]; double gamma_k = 1. / demography.sizes[k]; @@ -1364,11 +1309,11 @@ ThreadsFastLS::thread(const int new_sample_ID, const std::vector& genotype std::vector bp_starts; std::vector> target_IDs; std::vector segment_ages; - int total_num_het_sites = 0; // Date segments - for (int i = 0; i < best_path.size(); i++) { + for (int i = 0; i < static_cast(best_path.size()); i++) { int segment_start = std::get<0>(best_path[i]); - int segment_end = (i == best_path.size() - 1) ? num_sites : std::get<0>(best_path[i + 1]); + int segment_end = + (i == (static_cast(best_path.size()) - 1)) ? num_sites : std::get<0>(best_path[i + 1]); std::vector target_ID_L = std::get<1>(best_path[i]); std::vector het_hom_sites = @@ -1380,19 +1325,20 @@ ThreadsFastLS::thread(const int new_sample_ID, const std::vector& genotype num_het_sites++; } } - total_num_het_sites += num_het_sites; - // is it ok to have 100 here? - if (use_hmm && num_samples < 1000) { + + if (use_hmm && num_samples < HMM_SPLIT_THRESHOLD) { // is it ok to have 10 here? if (num_het_sites > 5) { std::vector breakpoints = hmm->breakpoints(het_hom_sites, segment_start); - for (int i = 0; i < breakpoints.size(); i++) { - int breakpoint_start = breakpoints[i]; - int breakpoint_end = (i == breakpoints.size() - 1) ? segment_end : breakpoints[i + 1]; + for (int j = 0; j < static_cast(breakpoints.size()); j++) { + int breakpoint_start = breakpoints[j]; + int breakpoint_end = + (j == (static_cast(breakpoints.size()) - 1)) ? segment_end : breakpoints[j + 1]; target_IDs.push_back(target_ID_L); bp_starts.push_back(static_cast(ceil(bp_boundaries[breakpoint_start]))); - // This is wrong!!! need to actually re-do num het_sites!!!! + // TODO Pass right number of heterozygous sites to date segments when HMM is used (ticket + // #24) segment_ages.push_back(date_segment(num_het_sites, breakpoint_start, breakpoint_end)); } } @@ -1414,7 +1360,8 @@ ThreadsFastLS::thread(const int new_sample_ID, const std::vector& genotype return remove_burn_in(bp_starts, target_IDs, segment_ages, het_sites); } -std::vector ThreadsFastLS::impute(std::vector& genotype, int neighborhood_size) { +std::vector ThreadsFastLS::impute(std::vector& genotype, + int neighborhood_size) { // vector of sample_ids, seg_starts, seg_ends (buffered) std::vector, std::vector, std::vector>> best_path; Node* match; @@ -1434,7 +1381,7 @@ std::vector ThreadsFastLS::impute(std::vector& genotype seg_ages.push_back(date_segment(0, seg_start, seg_end)); } - int num_segs = best_path.size(); + int num_segs = static_cast(best_path.size()); std::vector imputation_segments; // Special case for the first segment @@ -1443,15 +1390,12 @@ std::vector ThreadsFastLS::impute(std::vector& genotype const std::tuple, std::vector, std::vector>& segment = best_path[i]; const std::vector& samples = std::get<0>(segment); ImputationSegment imp_seg; - imp_seg.seg_start = physical_positions[std::get<1>(segment)[0]]; + imp_seg.seg_start = static_cast(physical_positions[std::get<1>(segment)[0]]); imp_seg.ids = samples; - std::vector wts; + std::vector weights(samples.size(), 1. / static_cast(samples.size())); std::vector ages(samples.size(), seg_ages[i]); imp_seg.ages = ages; - for (auto s : samples) { - wts.push_back(1. / samples.size()); - } - imp_seg.weights = wts; + imp_seg.weights = weights; imputation_segments.push_back(imp_seg); } return imputation_segments; @@ -1470,9 +1414,10 @@ ThreadsFastLS::diploid_ls(std::vector unphased_genotypes) { } std::tuple, std::vector>, std::vector, std::vector> -ThreadsFastLS::remove_burn_in(std::vector& bp_starts, std::vector>& target_IDs, - std::vector& segment_ages, std::vector& het_sites) { - int num_segments = bp_starts.size(); +ThreadsFastLS::remove_burn_in(std::vector& bp_starts, + std::vector>& target_IDs, + std::vector& segment_ages, std::vector& het_sites) { + int num_segments = static_cast(bp_starts.size()); std::vector trim_starts; std::vector> trim_IDs; @@ -1482,7 +1427,7 @@ ThreadsFastLS::remove_burn_in(std::vector& bp_starts, std::vector(threading_end) : bp_starts[i + 1]; if (threading_start < seg_end) { break; } @@ -1500,8 +1445,9 @@ ThreadsFastLS::remove_burn_in(std::vector& bp_starts, std::vector(threading_start); trim_IDs = {target_IDs.begin() + seg_start_i, target_IDs.begin() + seg_end_i}; trim_ages = {segment_ages.begin() + seg_start_i, segment_ages.begin() + seg_end_i}; } @@ -1515,16 +1461,8 @@ ThreadsFastLS::remove_burn_in(std::vector& bp_starts, std::vector ThreadsFastLS::overflow_region(const std::vector& genotypes, - const int sample_id, const int segment_start, - const int segment_end) { + const int sample_id, const int segment_start, + const int segment_end) { int overlap_start = segment_start; int overlap_end = segment_end; @@ -1569,11 +1503,8 @@ std::pair ThreadsFastLS::overflow_region(const std::vector& geno return std::pair(overlap_start, overlap_end); } -/** - * Fetch het-hom status for id1 and id2 in the region specified by site indices - */ std::vector ThreadsFastLS::fetch_het_hom_sites(const int id1, const int id2, const int start, - const int end) { + const int end) { if (ID_map.find(id1) == ID_map.end()) { std::cerr << "fetch_het_hom_sites bad id1 " << id1 << std::endl; exit(1); @@ -1593,22 +1524,22 @@ std::vector ThreadsFastLS::fetch_het_hom_sites(const int id1, const int id } // Given threading instructions, find all heterozygous sites -std::vector ThreadsFastLS::het_sites_from_thread(const int focal_ID, - const std::vector bp_starts, - const std::vector> target_IDs) { +std::vector +ThreadsFastLS::het_sites_from_thread(const int focal_ID, const std::vector bp_starts, + const std::vector> target_IDs) { std::vector het_sites; - int num_segments = bp_starts.size(); + int num_segments = static_cast(bp_starts.size()); int site_i = 0; for (int seg_i = 0; seg_i < num_segments; seg_i++) { int segment_start = bp_starts[seg_i]; - int segment_end = - seg_i == num_segments - 1 ? physical_positions.back() + 1 : bp_starts[seg_i + 1]; + int segment_end = seg_i == num_segments - 1 ? (static_cast(physical_positions.back()) + 1) + : bp_starts[seg_i + 1]; int target_ID = target_IDs[seg_i][0]; while (segment_start <= physical_positions[site_i] && physical_positions[site_i] < segment_end && site_i < num_sites) { if (panel[ID_map.at(focal_ID)][site_i]->genotype != panel[ID_map.at(target_ID)][site_i]->genotype) { - het_sites.push_back(physical_positions[site_i]); + het_sites.push_back(static_cast(physical_positions[site_i])); } site_i++; } diff --git a/src/ThreadsFastLS.hpp b/src/ThreadsFastLS.hpp index 0193c30..81a7ce5 100644 --- a/src/ThreadsFastLS.hpp +++ b/src/ThreadsFastLS.hpp @@ -1,8 +1,24 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #ifndef THREADS_ARG_THREADS_HPP #define THREADS_ARG_THREADS_HPP -#include "State.hpp" #include "HMM.hpp" +#include "State.hpp" #include #include @@ -12,90 +28,77 @@ struct ImputationSegment { int seg_start = 0; std::vector ids; + // these are at the *start* of the segment std::vector weights; std::vector ages; }; -// This class contains everything for Threads-fastLS +/// This class contains everything for Threads-fastLS class ThreadsFastLS { - -private: - // To access the linked lists in each column - std::vector> tops; - std::vector> bottoms; // Ha, ha - - std::vector> traceback_states; - - // Burn-in quantities - int trim_pos_start_idx = 0; - int trim_pos_end_idx = 0; - - // TODO set random seed at runtime (ticket #20) - std::mt19937 rng{1234}; - - inline size_t pair_key(int i, int j) { - return (size_t) i << 32 | (unsigned int) j; - } +public: + // Constructors and utils + ThreadsFastLS(std::vector _physical_positions, std::vector _genetic_positions, + double _mutation_rate, double ne, bool _sparse_sites, int _n_prune, bool _use_hmm, + int _burn_in_left, int _burn_in_right) + : ThreadsFastLS(_physical_positions, _genetic_positions, _mutation_rate, + std::vector{ne}, std::vector{0.0}, _sparse_sites, _n_prune, + _use_hmm, _burn_in_left, _burn_in_right){}; + ThreadsFastLS(std::vector _physical_positions, std::vector _genetic_positions, + double _mutation_rate, std::vector ne, std::vector ne_times, + bool _sparse_sites, int _n_prune, bool _use_hmm, int _burn_in_left, + int _burn_in_right); + + /// Find the next insert position + /// @param t Pointer to node below the sequence being inserted at site i + /// @param g Allele at site i+1 + /// @param i The site + /// @return Node* Pointer to node below the sequence being inserted at site i+1 Node* extend_node(Node* node, bool genotype, const int i); + + /// Determine whether state can be extended through panel by appending genotype 'g' + /// @param s State at site i + /// @param t_next Node at site i+1 + /// @param g Candidate genotype for s at i+1 + /// @param i The site index + /// @return bool true if extensible bool extensible_by(State& s, const Node* t_next, const bool g, const int i); + std::pair pair_extensible_by(StatePair& p, const bool g, const int i); + + /// @param id1 + /// @param id2 + /// @param start inclusive! + /// @param end exclusive! + /// @return bool whether sequences match on the interval bool genotype_interval_match(const int id1, const int id2, const int start, const int end); + + /// Assuming input genotypes match sample_id on [segment_start, segment_end), how much can we + /// extend the region in either direction with out hitting a mismatch std::pair overflow_region(const std::vector& genotypes, const int sample_id, const int segment_start, const int segment_end); - std::vector fetch_het_hom_sites(const int id1, const int id2, const int start, const int end); - std::vector het_sites_from_thread(const int focal_ID, std::vector bp_starts, - std::vector> target_IDs); -public: - int n_prune; - int num_sites; - int num_samples; - double mutation_rate; - int burn_in_left; - int burn_in_right; - double threading_start; - double threading_end; - // This determines the segment dating formula - bool sparse_sites; - // Whether to internally break up big segments - bool use_hmm; - std::unordered_map ID_map; - std::vector physical_positions; - std::vector genetic_positions; - std::vector bp_sizes; - std::vector cm_sizes; - std::vector bp_boundaries; - std::vector cm_boundaries; - Demography demography; - HMM* hmm; + /// Fetch het-hom status for id1 and id2 in the region specified by site indices + std::vector fetch_het_hom_sites(const int id1, const int id2, const int start, + const int end); - // The dynamic reference panel - std::vector>> panel; - - // Constructors and utils - ThreadsFastLS(std::vector _physical_positions, std::vector _genetic_positions, - double _mutation_rate, double ne, bool _sparse_sites, int _n_prune, bool _use_hmm, - int _burn_in_left, int _burn_in_right) - : ThreadsFastLS(_physical_positions, _genetic_positions, _mutation_rate, std::vector{ne}, - std::vector{0.0}, _sparse_sites, _n_prune, _use_hmm, _burn_in_left, - _burn_in_right){}; - ThreadsFastLS(std::vector _physical_positions, std::vector _genetic_positions, - double _mutation_rate, std::vector ne, std::vector ne_times, - bool _sparse_sites, int _n_prune, bool _use_hmm, int _burn_in_left, int _burn_in_right); + std::vector het_sites_from_thread(const int focal_ID, std::vector bp_starts, + std::vector> target_IDs); static std::tuple, std::vector> site_sizes(std::vector positions); // More attributes - std::vector trimmed_positions(); + std::vector trimmed_positions() const; - // Insertion/deletion - // Insert and assign generic ID + /// Insert and assign generic ID void insert(const std::vector& genotype); - // Insert and assign specific ID + + /// Insert a new sequence into the dynamic panel and assign specific ID void insert(const int ID, const std::vector& genotype); - // Remove sample from panel + + /// Deletes sequence ID from the dynamic panel. This moves the last sequence in the panel + /// to the position ID held. See alg 5 from d-PBWT paper. void remove(int ID); // HMM @@ -112,27 +115,102 @@ class ThreadsFastLS { std::vector>> threads_ls(const std::vector& genotype); std::vector impute(std::vector& genotype, int neighborhood_size); + + /// Run Li-Stephens on input haplotype *without* inserting into the dynamic panel. + /// See also Algorithm 4 of Lunter (2018), Bioinformatics. + /// For imputation we use the IMPUTE/Beagle mutation penalties std::pair fastLS(const std::vector& genotype, bool imputation = false); + + /// This is a basic traceback that samples a random haplotype per from all matches per segment and + /// also stores the lowest-numbered match (for compression) std::vector>> traceback(TracebackState* tb, Node* match, bool return_all = false); + + /// Similar to normal traceback, but picks the up-to neighborhood_size best matches and stores + /// their overlap with the input sequence returns a list of a tuple of lists :-P I.e., a list of + /// segments, and each segment is a tuple (sample_IDs, starts, ends) all of equal length <= + /// neighborhood_size std::vector, std::vector, std::vector>> - traceback_impute(std::vector& genotypes, TracebackState* tb, Node* match, int neighborhood_size); + traceback_impute(std::vector& genotypes, TracebackState* tb, Node* match, + int neighborhood_size); + + /// Run Li-Stephens on input diplotype *without* inserting into the dynamic panel. + /// See also Algorithm 4 of Lunter (2018), Bioinformatics. + /// Warning: The code here is more verbose than it has to be std::array, 2> fastLS_diploid(const std::vector& genotype); + std::array>>, 2> diploid_ls(std::vector unphased_genotypes); + + /// This gives the IMPUTE5 (and Beagle) recombination penalties + /// @return tuple of penalty vectors std::tuple, std::vector> mutation_penalties_impute5(); + + /// This gives the *sparse* recombination penalties + /// @return tuple of penalty vectors std::tuple, std::vector> recombination_penalties(); std::tuple, std::vector> mutation_penalties(); + + /// This gives the correct, dense, recombination penalties + /// @return tuple of penalty vectors std::tuple, std::vector> recombination_penalties_correct(); // TMRCA estimation + /// Date the segment based on length and n_mismatches using maximum likelihood. (No + /// demography) + /// @param id1 + /// @param id2 + /// @param start inclusive + /// @param end exclusive + /// @return double double date_segment(const int num_het_sites, const int start, const int end); + static double date_segment(int num_het_sites, double cm_length, double bp_length, double mutation_rate, Demography& demography); - static double date_segment_sparse(int num_het_sites, double cm_length, Demography& demography); - // Debugging - void print_sorting(); + static double date_segment_sparse(double cm_length, Demography& demography); + + /// For debugging: print the sample-IDs of the arrayified panel + void print_sorting() const; + +public: + int n_prune = 0; + int num_sites = 0; + int num_samples = 0; + double mutation_rate = 0.0; + int burn_in_left = 0; + int burn_in_right = 0; + double threading_start = 0.0; + double threading_end = 0.0; + + bool sparse_sites = false; ///< This determines the segment dating formula + bool use_hmm = false; ///< Whether to internally break up big segments + + std::unordered_map ID_map; + std::vector physical_positions; + std::vector genetic_positions; + std::vector bp_sizes; + std::vector cm_sizes; + std::vector bp_boundaries; + std::vector cm_boundaries; + Demography demography; + HMM* hmm = nullptr; + + // The dynamic reference panel + std::vector>> panel; + +private: + // To access the linked lists in each column + std::vector> tops; + std::vector> bottoms; + std::vector> traceback_states; + + // Burn-in quantities + int trim_pos_start_idx = 0; + int trim_pos_end_idx = 0; + + // TODO set random seed at runtime (ticket #20) + std::mt19937 rng{1234}; }; #endif // THREADS_ARG_THREADS_HPP diff --git a/src/ThreadsLowMem.cpp b/src/ThreadsLowMem.cpp index f40bbf8..c7f5c67 100644 --- a/src/ThreadsLowMem.cpp +++ b/src/ThreadsLowMem.cpp @@ -1,3 +1,19 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #include "ThreadsLowMem.hpp" #include @@ -8,15 +24,14 @@ #include #include - ThreadsLowMem::ThreadsLowMem(const std::vector _target_ids, const std::vector& _physical_positions, const std::vector& _genetic_positions, std::vector ne, std::vector ne_times, double _mutation_rate, bool _sparse) - : target_ids(_target_ids), physical_positions(_physical_positions), - genetic_positions(_genetic_positions), demography(Demography(ne, ne_times)), - mutation_rate(_mutation_rate), sparse(_sparse) { - num_samples = target_ids.size(); + : target_ids(_target_ids), mutation_rate(_mutation_rate), + physical_positions(_physical_positions), genetic_positions(_genetic_positions), + sparse(_sparse), demography(Demography(ne, ne_times)) { + num_samples = static_cast(target_ids.size()); if (physical_positions.size() != genetic_positions.size()) { throw std::runtime_error("Map lengths don't match."); } @@ -24,7 +39,7 @@ ThreadsLowMem::ThreadsLowMem(const std::vector _target_ids, throw std::runtime_error("Need at least 3 sites, found " + std::to_string(physical_positions.size())); } - num_sites = physical_positions.size(); + num_sites = static_cast(physical_positions.size()); // Check maps are strictly increasing for (int i = 0; i < num_sites - 1; i++) { @@ -49,7 +64,8 @@ ThreadsLowMem::ThreadsLowMem(const std::vector _target_ids, } // Mean interval size in base-pairs - mean_bp_size = (double) (physical_positions.back() - physical_positions[0]) / (num_sites - 1); + mean_bp_size = + (physical_positions.back() - physical_positions[0]) / static_cast(num_sites - 1); for (int target_id : target_ids) { segment_indices[target_id] = 0; expected_branch_lengths[target_id] = demography.expected_branch_length(target_id + 1); @@ -60,7 +76,7 @@ ThreadsLowMem::ThreadsLowMem(const std::vector _target_ids, het_sites_processed = 0; std::tie(bp_boundaries, bp_sizes) = ThreadsFastLS::site_sizes(physical_positions); - for (int i = 0; i < genetic_positions.size(); i++) { + for (std::size_t i = 0; i < genetic_positions.size(); i++) { if (i == genetic_positions.size() - 1) { // TO allow for closely spaced markers cm_sizes.push_back(0.0000001); @@ -87,7 +103,7 @@ void ThreadsLowMem::initialize_viterbi(std::vector& genotype) { bool group_change = false; - if (match_group_idx < match_groups.size() - 1 && - genetic_positions.at(hmm_sites_processed) >= - match_groups.at(match_group_idx + 1).cm_position) { + if (match_group_idx < (static_cast(match_groups.size()) - 1) && + (genetic_positions.at(hmm_sites_processed) >= + match_groups.at(match_group_idx + 1).cm_position)) { match_group_idx++; group_change = true; } @@ -150,7 +166,6 @@ void ThreadsLowMem::traceback() { } void ThreadsLowMem::process_site_hets(const std::vector& genotype) { - for (int target_id : target_ids) { if (target_id == 0) { if (genotype.at(0) == 1) { @@ -160,8 +175,8 @@ void ThreadsLowMem::process_site_hets(const std::vector& genotype) { else { ViterbiPath& path = paths.at(target_id); int current_seg_idx = segment_indices.at(target_id); - while (current_seg_idx < path.segment_starts.size() - 1 && - het_sites_processed >= path.segment_starts.at(current_seg_idx + 1)) { + while (current_seg_idx < (static_cast(path.segment_starts.size()) - 1) && + (het_sites_processed >= path.segment_starts.at(current_seg_idx + 1))) { current_seg_idx++; } segment_indices.at(target_id) = current_seg_idx; @@ -184,7 +199,7 @@ void ThreadsLowMem::date_segments() { throw std::runtime_error( "Can't date segments, not all sites have been parsed for heterozygosity."); } - // for (int i = 1; i < num_samples; i++) { + for (int target_id : target_ids) { if (target_id == 0) { continue; @@ -198,54 +213,41 @@ void ThreadsLowMem::date_segments() { } } - // I get segfault on this? - - // for (int i = 1; i < num_samples; i++) { for (int target_id : target_ids) { if (target_id == 0) { continue; } - // cout << "dating sample " << i << endl; - // compute length in bp/cM and num_hets for each segment ViterbiPath& path = paths.at(target_id); ViterbiPath new_path(target_id); - int n_segs = path.segment_starts.size(); - for (int k = 0; k < n_segs; k++) { + std::size_t n_segs = path.segment_starts.size(); + for (std::size_t k = 0; k < n_segs; k++) { int sample_id = path.sample_ids.at(k); int segment_start = path.segment_starts.at(k); int segment_end = k < n_segs - 1 ? path.segment_starts.at(k + 1) : num_sites - 1; - // cout << "dating segment [" << segment_start << ", " << segment_end << ") no. " << k << " - // out of " << n_segs - 1 << endl; if (segment_end == segment_start) { continue; } - double bp_size = physical_positions.at(segment_end) - physical_positions.at(segment_start); - double cm_size = genetic_positions.at(segment_end) - genetic_positions.at(segment_start); // This is inefficient but probably not that bad std::vector segment_hets; for (int h : path.het_sites) { - if (segment_start <= h && h < segment_end || - h == num_sites - 1 && segment_end == num_sites - 1) { + if (((segment_start <= h) && (h < segment_end)) || + ((h == num_sites - 1) && (segment_end == num_sites - 1))) { segment_hets.push_back(h); } else if (h >= segment_end) { break; } } - // cout << "found bpsize: " << bp_size << " cmsize " << cm_size << " nhets " << - // segment_hets.size() << " sample " << sample_id << endl; disabling hmm for now - if (target_id < n_hmm_samples && segment_hets.size() > hmm_min_sites) { - // cout << "running hmm on segment [" << segment_start << ", " << segment_end << ")... "; + + if ((target_id < n_hmm_samples) && (static_cast(segment_hets.size()) > hmm_min_sites)) { std::vector het_hom_sites(segment_end - segment_start, false); for (int h : segment_hets) { - // cout << h << endl; het_hom_sites[h - segment_start] = true; } // Here we use the hmm to break the big segment up into smaller segments std::vector breakpoints = psmc.breakpoints(het_hom_sites, segment_start); - // cout << "found " << breakpoints.size() << " breakpoints" << endl; - for (int j = 0; j < breakpoints.size(); j++) { + for (std::size_t j = 0; j < breakpoints.size(); j++) { int breakpoint_start = breakpoints[j]; int breakpoint_end = (j == breakpoints.size() - 1) ? segment_end : breakpoints[j + 1]; // there may be off-by-one errors here on the last segment (but who cares?) @@ -257,16 +259,16 @@ void ThreadsLowMem::date_segments() { // Same as above std::vector breakpoint_hets; for (int h : segment_hets) { - if (breakpoint_start <= h && h < breakpoint_end || - h == num_sites - 1 && breakpoint_end == num_sites - 1) { + if (((breakpoint_start <= h) && (h < breakpoint_end)) || + ((h == num_sites - 1) && (breakpoint_end == num_sites - 1))) { breakpoint_hets.push_back(h); } else if (h >= breakpoint_end) { break; } } - double height = ThreadsFastLS::date_segment( - breakpoint_hets.size(), cm_size, bp_size, mutation_rate, demography); + double height = ThreadsFastLS::date_segment(static_cast(breakpoint_hets.size()), + cm_size, bp_size, mutation_rate, demography); new_path.append(breakpoint_start, sample_id, height, breakpoint_hets); } } @@ -274,8 +276,8 @@ void ThreadsLowMem::date_segments() { // there are off-by-one errors here on the last segment (but who cares?) double bp_size = physical_positions.at(segment_end) - physical_positions.at(segment_start); double cm_size = genetic_positions.at(segment_end) - genetic_positions.at(segment_start); - double height = - ThreadsFastLS::date_segment(segment_hets.size(), cm_size, bp_size, mutation_rate, demography); + double height = ThreadsFastLS::date_segment( + static_cast(segment_hets.size()), cm_size, bp_size, mutation_rate, demography); new_path.append(segment_start, sample_id, height, segment_hets); } } @@ -284,9 +286,8 @@ void ThreadsLowMem::date_segments() { return; } -int ThreadsLowMem::count_branches() { +int ThreadsLowMem::count_branches() const { int n_branches = 0; - // for (int i = 1; i < num_samples; i++) { for (int target_id : target_ids) { if (target_id == 0) { continue; @@ -297,7 +298,6 @@ int ThreadsLowMem::count_branches() { } void ThreadsLowMem::prune() { - // for (int i = 1; i < num_samples; i++) { for (int target_id : target_ids) { if (target_id == 0) { continue; diff --git a/src/ThreadsLowMem.hpp b/src/ThreadsLowMem.hpp index 222fa7c..182d694 100644 --- a/src/ThreadsLowMem.hpp +++ b/src/ThreadsLowMem.hpp @@ -1,52 +1,33 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #ifndef THREADS_ARG_THREADS_LOW_MEM_HPP #define THREADS_ARG_THREADS_LOW_MEM_HPP +#include "Demography.hpp" #include "Matcher.hpp" #include "ThreadsFastLS.hpp" #include "ViterbiLowMem.hpp" -#include "Demography.hpp" #include #include #include #include class ThreadsLowMem { - -private: - Demography demography; - - // 2. HMM quantites - int hmm_sites_processed; - std::unordered_map hmms; - int match_group_idx; - std::vector match_groups; - - // 3. Path segment and path dating quantites - HMM psmc; - // std::vector cm_sizes; - // std::vector bp_sizes; - int het_sites_processed; - int n_hmm_samples = 100; - int hmm_min_sites = 10; - public: - // This object will only run the HMM for these ids - std::vector target_ids; - std::unordered_map expected_branch_lengths; - double mean_bp_size; - std::unordered_map segment_indices; - std::unordered_map paths; - int num_samples; - int num_sites; - double mutation_rate; - std::vector physical_positions; - std::vector genetic_positions; - std::vector bp_sizes; - std::vector cm_sizes; - std::vector bp_boundaries; - std::vector cm_boundaries; - bool sparse; - ThreadsLowMem(const std::vector _target_ids, const std::vector& _physical_positions, const std::vector& _genetic_positions, std::vector ne, std::vector ne_times, double _mutation_rate, bool _sparse); @@ -72,12 +53,45 @@ class ThreadsLowMem { // 4. save output (done on Python side) - int count_branches(); + int count_branches() const; // Make pickle-able path output std::tuple>, std::vector>, std::vector>, std::vector>> serialize_paths(); + +public: + // This object will only run the HMM for these ids + std::vector target_ids; + std::unordered_map expected_branch_lengths; + double mean_bp_size = 0.0; + std::unordered_map segment_indices; + std::unordered_map paths; + int num_samples = 0; + int num_sites = 0; + double mutation_rate = 0.0; + std::vector physical_positions; + std::vector genetic_positions; + std::vector bp_sizes; + std::vector cm_sizes; + std::vector bp_boundaries; + std::vector cm_boundaries; + bool sparse = false; + +private: + Demography demography; + + // 2. HMM quantites + int hmm_sites_processed = 0; + std::unordered_map hmms; + int match_group_idx = 0; + std::vector match_groups; + + // 3. Path segment and path dating quantites + HMM psmc; + int het_sites_processed = 0; + int n_hmm_samples = 100; + int hmm_min_sites = 10; }; #endif // THREADS_ARG_THREADS_LOW_MEM_HPP diff --git a/src/ViterbiLowMem.cpp b/src/ViterbiLowMem.cpp index 4c8106a..27273f5 100644 --- a/src/ViterbiLowMem.cpp +++ b/src/ViterbiLowMem.cpp @@ -1,3 +1,19 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #include "ViterbiLowMem.hpp" #include #include @@ -7,12 +23,21 @@ #include #include +namespace { + +const int ALLELE_UNPHASED_HET = -7; + +inline std::size_t coord_id_key(int i, int j) { + return (static_cast(i) << 32) | static_cast(j); +} + +} // namespace TracebackNode::TracebackNode(int _sample_id, int _site, TracebackNode* _previous, double _score) : sample_id(_sample_id), site(_site), previous(_previous), score(_score) { } -size_t TracebackNode::key() { +std::size_t TracebackNode::key() const { return coord_id_key(site, sample_id); } @@ -31,8 +56,8 @@ void ViterbiPath::reverse() { std::reverse(sample_ids.begin(), sample_ids.end()); } -int ViterbiPath::size() { - return segment_starts.size(); +int ViterbiPath::size() const { + return static_cast(segment_starts.size()); } void ViterbiPath::append(int segment_start, int sample_id) { @@ -68,7 +93,7 @@ void ViterbiPath::map_positions(std::vector& positions) { std::tuple, std::vector, std::vector> ViterbiPath::dump_data_in_range(int start, int end) { int n_segs = size(); - if (start == -1 && end == -1 || n_segs == 0) { + if (((start == -1) && (end == -1)) || (n_segs == 0)) { return std::tuple, std::vector, std::vector>( bp_starts, sample_ids, heights); } @@ -82,8 +107,8 @@ ViterbiPath::dump_data_in_range(int start, int end) { for (int i = 0; i < n_segs; i++) { int seg_start = bp_starts.at(i); int seg_end = i < n_segs - 1 ? bp_starts.at(i + 1) : tmp_end; - if (seg_start <= tmp_start && tmp_start < seg_end || - tmp_start <= seg_start && seg_start < tmp_end) { + if (((seg_start <= tmp_start) && (tmp_start < seg_end)) || + ((tmp_start <= seg_start) && (seg_start < tmp_end))) { out_starts.push_back(std::max(tmp_start, seg_start)); out_ids.push_back(sample_ids.at(i)); out_heights.push_back(heights.at(i)); @@ -104,7 +129,7 @@ ViterbiState::ViterbiState(int _target_id, std::vector _sample_ids) std::to_string(target_id)); } for (int sample_id : sample_ids) { - size_t key = coord_id_key(0, sample_id); + std::size_t key = coord_id_key(0, sample_id); traceback_states.emplace(key, TracebackNode(sample_id, 0, nullptr, 0.)); current_tracebacks[sample_id] = &traceback_states.at(key); } @@ -112,7 +137,6 @@ ViterbiState::ViterbiState(int _target_id, std::vector _sample_ids) best_match = sample_ids.at(0); } -// -9 is "missing" (not implemented yet) and -7 is "unphased" void ViterbiState::process_site(const std::vector& genotype, double rho, double rho_c, double mu, double mu_c) { int current_site = sites_processed; @@ -124,8 +148,7 @@ void ViterbiState::process_site(const std::vector& genotype, double rho, do for (int sample_id : sample_ids) { int allele = genotype.at(sample_id); double copy_penalty; - // "-7" encodes unphased heterozygotes, because why not - if (allele == -7 || observed_allele == -7) { + if ((allele == ALLELE_UNPHASED_HET) || (observed_allele == ALLELE_UNPHASED_HET)) { copy_penalty = (mu_c + mu) / 2.; } else { @@ -135,7 +158,7 @@ void ViterbiState::process_site(const std::vector& genotype, double rho, do // If we've just added new sites (this will happen vary rarely), // recombine from previous best state new_score = best_score + copy_penalty + rho; - size_t key = coord_id_key(current_site, sample_id); + std::size_t key = coord_id_key(current_site, sample_id); traceback_states.emplace(key, TracebackNode(sample_id, current_site, prev_best, new_score)); current_tracebacks[sample_id] = &traceback_states.at(key); } @@ -150,7 +173,7 @@ void ViterbiState::process_site(const std::vector& genotype, double rho, do else { // If we recombine, add a new branch new_score = best_score + copy_penalty + rho; - size_t key = coord_id_key(current_site, sample_id); + std::size_t key = coord_id_key(current_site, sample_id); traceback_states.emplace(key, TracebackNode(sample_id, current_site, prev_best, new_score)); current_tracebacks.at(sample_id) = &traceback_states.at(key); } @@ -180,7 +203,7 @@ void ViterbiState::set_samples(std::unordered_set new_sample_ids) { } void ViterbiState::prune() { - std::unordered_map tmp_traceback_states; + std::unordered_map tmp_traceback_states; for (int sample_id : sample_ids) { TracebackNode* state = current_tracebacks.at(sample_id); @@ -197,12 +220,13 @@ void ViterbiState::prune() { } // add everything above and return a key to the new address -TracebackNode* ViterbiState::recursive_insert(std::unordered_map& state_map, - TracebackNode* state) { +TracebackNode* +ViterbiState::recursive_insert(std::unordered_map& state_map, + TracebackNode* state) { if (state == nullptr) { return nullptr; } - size_t key = state->key(); + std::size_t key = state->key(); if (!state_map.count(key)) { TracebackNode* parent = state->previous; TracebackNode* new_parent = recursive_insert(state_map, parent); @@ -211,8 +235,8 @@ TracebackNode* ViterbiState::recursive_insert(std::unordered_map(traceback_states.size()); } ViterbiPath ViterbiState::traceback() { diff --git a/src/ViterbiLowMem.hpp b/src/ViterbiLowMem.hpp index a3352be..9a288ea 100644 --- a/src/ViterbiLowMem.hpp +++ b/src/ViterbiLowMem.hpp @@ -1,3 +1,19 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #ifndef THREADS_ARG_VITERBI_LOW_MEM_HPP #define THREADS_ARG_VITERBI_LOW_MEM_HPP @@ -7,30 +23,19 @@ #include class TracebackNode { -private: - inline size_t coord_id_key(int i, int j) { - return (size_t) i << 32 | (unsigned int) j; - } +public: + TracebackNode(int _sample_id, int _site, TracebackNode* _previous, double _score); + std::size_t key() const; public: int sample_id = 0; int site = 0; - double score = 0.0; TracebackNode* previous = nullptr; - TracebackNode(int _sample_id, int _site, TracebackNode* _previous, double _score); - size_t key(); + double score = 0.0; }; class ViterbiPath { -private: public: - double score = 0.0; - int target_id = 0; - std::vector bp_starts; - std::vector segment_starts; - std::vector sample_ids; - std::vector heights; - std::vector het_sites; ViterbiPath(int _target_id); ViterbiPath(int _target_id, std::vector _segment_starts, std::vector _sample_ids, std::vector _heights, std::vector _het_sites); @@ -39,18 +44,33 @@ class ViterbiPath { std::tuple, std::vector, std::vector> dump_data_in_range(int start_idx, int end_idx); void reverse(); - int size(); + int size() const; void map_positions(std::vector& positions); + +public: + double score = 0.0; + int target_id = 0; + std::vector bp_starts; + std::vector segment_starts; + std::vector sample_ids; + std::vector heights; + std::vector het_sites; }; class ViterbiState { +public: + ViterbiState(int _target_id, std::vector _sample_ids); + + void process_site(const std::vector& genotype, double rho, double rho_c, double _mu, + double _mu_c); + void set_samples(std::unordered_set new_sample_ids); + int count_branches() const; + void prune(); + ViterbiPath traceback(); private: - inline size_t coord_id_key(int i, int j) { - return (size_t) i << 32 | (unsigned int) j; - } - std::unordered_map traceback_states; - TracebackNode* recursive_insert(std::unordered_map& state_map, + std::unordered_map traceback_states; + TracebackNode* recursive_insert(std::unordered_map& state_map, TracebackNode* state); public: @@ -62,15 +82,6 @@ class ViterbiState { std::vector sample_ids; std::vector sample_scores; std::unordered_map current_tracebacks; - - ViterbiState(int _target_id, std::vector _sample_ids); - - void process_site(const std::vector& genotype, double rho, double rho_c, double _mu, - double _mu_c); - void set_samples(std::unordered_set new_sample_ids); - int count_branches(); - void prune(); - ViterbiPath traceback(); }; #endif // THREADS_ARG_VITERBI_LOW_MEM_HPP diff --git a/src/threads_arg/__init__.py b/src/threads_arg/__init__.py index 580401a..6ef93f0 100644 --- a/src/threads_arg/__init__.py +++ b/src/threads_arg/__init__.py @@ -1 +1,17 @@ +# This file is part of the Threads software suite. +# Copyright (C) 2024 Threads Developers. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + from threads_arg_python_bindings import * diff --git a/src/threads_arg/__main__.py b/src/threads_arg/__main__.py index 380a079..62a579c 100644 --- a/src/threads_arg/__main__.py +++ b/src/threads_arg/__main__.py @@ -1,3 +1,19 @@ +# This file is part of the Threads software suite. +# Copyright (C) 2024 Threads Developers. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + import os import gc import sys @@ -263,9 +279,9 @@ def partial_viterbi(pgen, mode, num_samples_hap, physical_positions, genetic_pos phased_out = np.empty((g_size, num_samples_hap // 2), dtype=np.uint8) if (phased_out == 0).any(): unphased_sites, unphased_samples = (1 - phased_out).nonzero() - # We encode "unphased het" by "-7" - alleles_out[unphased_sites, 2 * unphased_samples] = -7 - alleles_out[unphased_sites, 2 * unphased_samples + 1] = -7 + ALLELE_UNPHASED_HET = -7 + alleles_out[unphased_sites, 2 * unphased_samples] = ALLELE_UNPHASED_HET + alleles_out[unphased_sites, 2 * unphased_samples + 1] = ALLELE_UNPHASED_HET reader.read_alleles_and_phasepresent_range(b_start, b_end, alleles_out, phased_out) # For each variant in chunk, pass the genotypes through Threads-LS diff --git a/src/threads_arg/mapping_utils.py b/src/threads_arg/mapping_utils.py index 3dc00e0..9ae4d5a 100644 --- a/src/threads_arg/mapping_utils.py +++ b/src/threads_arg/mapping_utils.py @@ -1,3 +1,19 @@ +# This file is part of the Threads software suite. +# Copyright (C) 2024 Threads Developers. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + import arg_needle_lib import numpy as np import logging diff --git a/src/threads_arg/utils.py b/src/threads_arg/utils.py index 311f5c5..f355c50 100644 --- a/src/threads_arg/utils.py +++ b/src/threads_arg/utils.py @@ -1,3 +1,19 @@ +# This file is part of the Threads software suite. +# Copyright (C) 2024 Threads Developers. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + import os import numpy as np import h5py @@ -54,9 +70,9 @@ def interpolate_map(map_gz, pgen): Reading in map file (format has columns [chrom, SNP, cM-pos, bp]) """ if (map_gz[:-3] == ".gz") : - maps = pd.read_table(map_gz, header=None, compression='gzip', delim_whitespace=True) + maps = pd.read_table(map_gz, header=None, compression='gzip', sep="\\s+") else: - maps = pd.read_table(map_gz, header=None, delim_whitespace=True) + maps = pd.read_table(map_gz, header=None, sep="\\s+") cm_pos_map = maps[2].values.astype(np.float64) phys_pos_map = maps[3].values.astype(np.float64) pvar = pgen.replace("pgen", "pvar") @@ -64,9 +80,9 @@ def interpolate_map(map_gz, pgen): physical_positions = None if os.path.isfile(bim): - physical_positions = np.array(pd.read_table(bim, delim_whitespace=True, header=None, comment='#')[3]).astype(np.float64) + physical_positions = np.array(pd.read_table(bim, sep="\\s+", header=None, comment='#')[3]).astype(np.float64) elif os.path.isfile(pvar): - physical_positions = np.array(pd.read_table(pvar, delim_whitespace=True, header=None, comment='#')[1]).astype(np.float64) + physical_positions = np.array(pd.read_table(pvar, sep="\\s+", header=None, comment='#')[1]).astype(np.float64) else: raise RuntimeError(f"Can't find {bim} or {pvar}") @@ -87,10 +103,10 @@ def get_map_from_bim(pgen, rho): cm_out = None physical_positions = None if os.path.isfile(bim): - physical_positions = np.array(pd.read_table(bim, delim_whitespace=True, header=None, comment='#')[3]).astype(int) + physical_positions = np.array(pd.read_table(bim, sep="\\s+", header=None, comment='#')[3]).astype(int) cm_out = rho * 100 * physical_positions elif os.path.isfile(pvar): - physical_positions = np.array(pd.read_table(pvar, delim_whitespace=True, header=None, comment='#')[1]).astype(int) + physical_positions = np.array(pd.read_table(pvar, sep="\\s+", header=None, comment='#')[1]).astype(int) cm_out = rho * 100 * physical_positions else: raise RuntimeError(f"Can't find {bim} or {pvar}") @@ -101,5 +117,5 @@ def get_map_from_bim(pgen, rho): return cm_out, physical_positions def parse_demography(demography): - d = pd.read_table(demography, delim_whitespace=True, header=None) + d = pd.read_table(demography, sep="\\s+", header=None) return list(d[0]), list(d[1]) diff --git a/src/threads_arg_pybind.cpp b/src/threads_arg_pybind.cpp index 9ac0c77..4168f65 100644 --- a/src/threads_arg_pybind.cpp +++ b/src/threads_arg_pybind.cpp @@ -1,8 +1,23 @@ -#include "TGEN.hpp" +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #include "ImputationMatcher.hpp" +#include "TGEN.hpp" #include "ThreadsLowMem.hpp" -#include #include #include #include @@ -63,7 +78,8 @@ PYBIND11_MODULE(threads_arg_python_bindings, m) { py::class_(m, "Matcher") .def(py::init, double, double, int, int>(), "Initialize", py::arg("num_samples"), py::arg("genetic_positions"), py::arg("query_interval_size"), - py::arg("match_group_interval_size"), py::arg("neighborhood_size") = 4, py::arg("min_matches") = 4) + py::arg("match_group_interval_size"), py::arg("neighborhood_size") = 4, + py::arg("min_matches") = 4) .def_readonly("query_sites", &Matcher::query_sites) .def_readonly("genetic_positions", &Matcher::genetic_positions) .def_readonly("query_interval_size", &Matcher::query_interval_size) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 0dddbfc..7614629 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,5 +1,18 @@ -# This file is part of https://github.com/PalamaraLab/threads which is released under the GPL-3.0 license. -# See accompanying LICENSE and COPYING for copyright notice and full details. +# This file is part of the Threads software suite. +# Copyright (C) 2024 Threads Developers. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . # Get Catch2 Include(FetchContent) diff --git a/test/test_data_snapshot.py b/test/test_data_snapshot.py index 9c9cb4f..318b76f 100644 --- a/test/test_data_snapshot.py +++ b/test/test_data_snapshot.py @@ -1,3 +1,19 @@ +# This file is part of the Threads software suite. +# Copyright (C) 2024 Threads Developers. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + import numpy as np import h5py import tempfile diff --git a/test/test_demography.cpp b/test/test_demography.cpp index fa9b815..c2edca1 100644 --- a/test/test_demography.cpp +++ b/test/test_demography.cpp @@ -1,3 +1,19 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #include "Demography.hpp" #include @@ -30,6 +46,6 @@ TEST_CASE("Demography constructor exceptions") { TEST_CASE("Demography std_to_gen exceptions") { SECTION("std_to_gen value must be non-negative") { Demography d{{1}, {0}}; - CHECK_THROWS_WITH(d.std_to_gen(-1), Catch::Matchers::ContainsSubstring("Demography can only convert non-negative times to std")); + CHECK_THROWS_WITH(d.std_to_gen(-1), Catch::Matchers::ContainsSubstring("Demography can only convert times greater than the first entry")); } } diff --git a/test/test_viterbi_state.cpp b/test/test_viterbi_state.cpp index e1bdabe..3356a6d 100644 --- a/test/test_viterbi_state.cpp +++ b/test/test_viterbi_state.cpp @@ -1,3 +1,19 @@ +// This file is part of the Threads software suite. +// Copyright (C) 2024 Threads Developers. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #include "HMM.hpp" #include