first public commit

Weeks-UNC · Apr 9, 2018 · 17e6fe2 · 17e6fe2
commit 17e6fe2
Show file tree

Hide file tree

Showing 550 changed files with 213,536 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+thirdparty/
diff --git a/LICENSE.txt b/LICENSE.txt
@@ -0,0 +1,19 @@
+Copyright 2017 Steven Busan
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/README.html b/README.html
diff --git a/README.md b/README.md
diff --git a/bin/check_fasta_format.py b/bin/check_fasta_format.py
@@ -0,0 +1,149 @@
+# --------------------------------------------------------------------- #
+#  This file is a part of ShapeMapper, and is licensed under the terms  #
+#  of the MIT license. Copyright 2017 Steven Busan.                     #
+# --------------------------------------------------------------------- #
+
+
+import sys, os
+
+
+def check_fasta(fasta_path,
+                corrected_fasta_path):
+    """
+    Check a fasta file for format requirements and attempt to correct
+    simple problems.
+
+    Correctable problems (non-unix line endings, 'U' in place of 'T',
+    spaces within a sequence) are fixed and the result is written to
+    corrected_fasta_path. Uncorrectable problems (too few lines,
+    missing/extra/duplicated headers, missing sequences, non-alphabetic
+    sequence characters) are only reported; in that case no corrected
+    file is written.
+
+    Args:
+        fasta_path: Path of the FASTA file to check.
+        corrected_fasta_path: Path the (possibly corrected) FASTA file is
+            written to when no uncorrectable problem was found.
+
+    Returns:
+        True if no problem of either kind was found. False otherwise, after
+        a description of the problems has been written to stderr.
+
+    """
+
+    # newline="" turns off universal-newline translation. In the default
+    # text mode "\r\n" and "\r" are converted to "\n" while reading, so
+    # non-unix line endings could never be detected.
+    with open(fasta_path, "r", newline="") as fa:
+        lines = fa.readlines()
+
+    corrected_msgs = []
+    uncorrected_msgs = []
+
+    if any("\r" in line for line in lines):
+        lines = [line.strip()+"\n" for line in lines]
+        corrected_msgs.append("Line endings not in unix format.")
+
+    if len(lines) < 2:
+        uncorrected_msgs.append("Not enough lines (need at least a header and a sequence).")
+
+    found_U = False
+    found_whitespace = False
+    found_nonalpha = False
+    missing_header = False
+    missing_seq = False
+    extra_headers = False
+    dup_headers = False
+
+    fixed_lines = list(lines)
+    stop_scanning = False
+    headers = []
+    header_count = 0
+    seq_count = 0
+    for i, line in enumerate(lines):
+        if line.startswith('>'):
+            # A header is only valid if the previous header already
+            # received a sequence.
+            if header_count != seq_count:
+                extra_headers = True
+                break
+            headers.append(line[1:].rstrip())
+            header_count += 1
+            continue
+        if line.strip():
+            # First non-blank line after a header starts that header's
+            # sequence; further lines belong to the same sequence.
+            if header_count == seq_count+1:
+                seq_count += 1
+            elif header_count == 0:
+                missing_header = True
+        for c in line:
+            if c == "\n":
+                continue
+            elif c.upper() == "U":
+                found_U = True
+            elif c == ' ':
+                found_whitespace = True
+            elif not c.isalpha():
+                # Before any header this is most likely a misformatted
+                # sequence name rather than a bad sequence character.
+                if header_count == 0:
+                    missing_header = True
+                else:
+                    found_nonalpha = True
+                stop_scanning = True
+                break
+        # The flags are sticky, so once set every later sequence line is
+        # rewritten too; the replacements are no-ops on clean lines.
+        if found_whitespace or found_U:
+            fixed_lines[i] = line.replace('u', 't').replace('U','T').replace(' ','')
+        if stop_scanning:
+            break
+    if header_count < seq_count:
+        missing_header = True
+    elif header_count > seq_count:
+        missing_seq = True
+    if len(set(headers)) != len(headers):
+        dup_headers = True
+
+    if found_U:
+        msg = "One or more 'U' found in sequence (should be 'T')."
+        corrected_msgs.append(msg)
+    if found_whitespace:
+        msg = "Space(s) found within sequence (STAR aligner interprets these as Ns)."
+        corrected_msgs.append(msg)
+    if found_nonalpha:
+        msg = "One or more non-alphabetic characters found in sequence"
+        msg += " or sequence name is misformatted (must start with '>')."
+        uncorrected_msgs.append(msg)
+    if missing_header:
+        msg = "Missing header(s). Preceding each sequence, each sequence"
+        msg += " name should be on its own line and start with the '>' character."
+        uncorrected_msgs.append(msg)
+    if missing_seq:
+        msg = "Missing one or more sequences."
+        uncorrected_msgs.append(msg)
+    if extra_headers:
+        msg = "Extra header(s), missing sequence(s), or headers in"
+        msg += " wrong location. Preceding each sequence, each sequence"
+        msg += " name should be on its own line and start with the '>' character."
+        uncorrected_msgs.append(msg)
+    if dup_headers:
+        msg = "Duplicated sequence names."
+        uncorrected_msgs.append(msg)
+    msg = ''
+
+    if len(uncorrected_msgs) == 0:
+        # newline="\n" guarantees unix line endings on every platform.
+        with open(corrected_fasta_path, "w", newline="\n") as fa:
+            fa.write("".join(fixed_lines))
+    if len(corrected_msgs) > 0 and len(uncorrected_msgs) == 0:
+        msg += "ERROR: FASTA file {} contains errors:\n".format(fasta_path)
+        for m in corrected_msgs:
+            msg += " - {}\n".format(m)
+        msg += "These errors were corrected and written to {}\n".format(corrected_fasta_path)
+        msg += "Replace the original FASTA file with the new one\n"
+        msg += "before rerunning shapemapper.\n"
+    if len(uncorrected_msgs) > 0:
+        msg += "ERROR: FASTA file {} contains errors:\n".format(fasta_path)
+        for m in uncorrected_msgs:
+            msg += " - {}\n".format(m)
+        for m in corrected_msgs:
+            msg += " - {}\n".format(m)
+    if len(msg) > 0:
+        sys.stderr.write(msg)
+        return False
+    else:
+        return True
+
+
+if __name__ == "__main__":
+    # Command-line entry point: expects the input FASTA path followed by
+    # the path the corrected FASTA should be written to.
+    cli_args = sys.argv[1:]
+    if len(cli_args) != 2:
+        print("Usage: python3.5 check_fasta_format.py <example.fa> <corrected.fa>")
+        # NOTE(review): a usage error exits with status 0 -- confirm that
+        # callers do not rely on a non-zero status here.
+        sys.exit(0)
+    input_path, output_path = cli_args
+    if not check_fasta(input_path, output_path):
+        # Problems were already reported on stderr by check_fasta.
+        sys.exit(1)
+    print("No formatting errors detected.")
diff --git a/bin/compare_fasta.py b/bin/compare_fasta.py
@@ -0,0 +1,70 @@
+"""
+Check if two fasta files have identical sequences
+
+"""
+
+# --------------------------------------------------------------------- #
+#  This file is a part of ShapeMapper, and is licensed under the terms  #
+#  of the MIT license. Copyright 2017 Steven Busan.                     #
+# --------------------------------------------------------------------- #
+
+import sys
+import shutil
+import traceback
+
+
+def load_fasta(fastaname, rna=None, convert_to_rna=True):
+    """
+    Load a single sequence from a fasta file.
+
+    Args:
+        fastaname: Path of the fasta file to read.
+        rna: Name of the sequence to load (header text after '>', stripped).
+            If None, sequence lines are collected from the start of the file.
+        convert_to_rna: If True, replace every uppercase "T" in the
+            returned sequence with "U".
+
+    Returns:
+        Tuple (seq_name, seq): the name from the last header processed
+        before reading stopped (None if the file has no header) and the
+        collected sequence with per-line whitespace stripped.
+
+    Raises:
+        RuntimeError: If rna is None and a second header is reached before
+            any sequence characters were collected.
+    """
+    seq = ""
+    rna_count = 0
+    seq_name = None
+    # With no name given, every sequence line counts as selected.
+    in_selected_rna = rna is None
+    # Plain "r" already gives universal newlines on Python 3; the "U" flag
+    # is rejected by newer interpreters. The with-block closes the file.
+    with open(fastaname, "r") as f:
+        for line in f:
+            if line.startswith(">"):
+                name = line[1:].strip()
+                rna_count += 1
+                # A header following collected sequence ends the record.
+                # NOTE(review): this check runs before the multi-sequence
+                # check below, so with rna=None a multi-sequence file only
+                # raises when its first record is empty -- confirm intended.
+                if len(seq) > 0:
+                    break
+                if rna_count > 1 and rna is None:
+                    s = "Error: fasta file "
+                    s += "\"" + fastaname + "\""
+                    s += " contains more than one sequence, but no"
+                    s += " sequence name was specified."
+                    raise RuntimeError(s)
+                if name == rna:
+                    in_selected_rna = True
+                seq_name = name
+                continue
+            if in_selected_rna:
+                seq += line.strip()
+    if convert_to_rna:
+        seq = seq.replace("T", "U")
+    return seq_name, seq
+
+# Script body: compare the sequences loaded from the two fasta files given
+# on the command line and report a mismatch on stderr.
+try:
+    if len(sys.argv)<3:
+        print("Usage: python compare_fasta.py <seq1>.fa <seq2>.fa")
+        sys.exit()
+
+    _, s1 = load_fasta(sys.argv[1])
+    _, s2 = load_fasta(sys.argv[2])
+
+    if s1 != s2:
+        sys.stderr.write("ERROR: Corrected sequence does not match original sequence.\n")
+        sys.stderr.write(s1+"\n")
+        sys.stderr.write(s2+"\n")
+    # NOTE(review): the exit status is 0 for both a match and a mismatch;
+    # kept as-is because callers may depend on it -- confirm intended.
+    sys.exit(0)
+except Exception as e:
+    # SystemExit and KeyboardInterrupt derive from BaseException, so they
+    # are not caught here and propagate normally.
+    sys.stderr.write("ERROR: "+traceback.format_exc())
+    sys.stderr.write("{}".format(e))
diff --git a/bin/get_sequence_lengths.py b/bin/get_sequence_lengths.py
@@ -0,0 +1,57 @@
+"""
+Given a fasta file containing one or more sequences, write the length of each
+sequence to its own output file. Used to dynamically update sequence length
+parameters for pipeline components such as MutationCounter.
+
+"""
+
+# --------------------------------------------------------------------- #
+#  This file is a part of ShapeMapper, and is licensed under the terms  #
+#  of the MIT license. Copyright 2017 Steven Busan.                     #
+# --------------------------------------------------------------------- #
+
+import argparse, sys, os
+
+
+def get_lengths(filename):
+    """
+    Return the length of every sequence in a fasta file.
+
+    Args:
+        filename: Path of the fasta file to read.
+
+    Returns:
+        List of ints, one per header line (line starting with ">"), in file
+        order. Each value is the total number of characters on the lines
+        following that header, with leading/trailing whitespace stripped
+        from each line.
+
+    Raises:
+        IndexError: If non-blank sequence data appears before the first
+            header line.
+    """
+    lengths = []
+    # Plain "r" already gives universal newlines on Python 3; the "U" flag
+    # is rejected by newer interpreters. The with-block closes the file.
+    with open(filename, "r") as f:
+        for line in f:
+            if line.startswith(">"):
+                lengths.append(0)
+                continue
+            n_chars = len(line.strip())
+            # Blank lines add nothing; skipping them also tolerates blank
+            # lines ahead of the first header.
+            if n_chars:
+                lengths[-1] += n_chars
+    return lengths
+
+
+def write_lengths(outnames, lengths):
+    """
+    Write each sequence length to its own output file.
+
+    Args:
+        outnames: Output file paths, one per length, in matching order.
+            Missing parent directories are created.
+        lengths: Sequence lengths to write; each is written as str(length)
+            with no trailing newline.
+
+    Raises:
+        AssertionError: If outnames and lengths differ in length.
+    """
+    assert len(outnames) == len(lengths)
+    for oname, length in zip(outnames, lengths):
+        out_dir = os.path.split(oname)[0]
+        if len(out_dir) > 0:
+            os.makedirs(out_dir, exist_ok=True)
+        # The with-block flushes and closes each file deterministically.
+        with open(oname, "w") as o:
+            o.write(str(length))
+
+
+if __name__ == "__main__":
+    # Command-line entry point: read the fasta given by --fa and write one
+    # length per sequence to the files listed after --out.
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--fa",
+                     help="Input fasta file",
+                     required=True,
+                     type=str)
+    cli.add_argument("--out",
+                     help='Explicit output filenames (assumed same order as sequences in fasta)',
+                     required=True,
+                     type=str,
+                     nargs="+")
+    options = cli.parse_args(sys.argv[1:])
+
+    write_lengths(options.out, get_lengths(options.fa))