Skip to content

Commit

Permalink
Merge pull request #70 from VDBWRAIR/dev
Browse files Browse the repository at this point in the history
Next release
  • Loading branch information
averagehat committed Jan 4, 2016
2 parents 2a110c2 + b61a578 commit 5316608
Show file tree
Hide file tree
Showing 71 changed files with 2,807 additions and 126 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ report.html

# Sphinx documentation
docs/_build/
docs/_static/
docs/_templates/

# PyBuilder
Expand Down
9 changes: 2 additions & 7 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,7 @@ install:
- pip install -r tests/requirements.txt
- python setup.py develop
script:
- nosetests tests --with-coverage --cover-erase --cover-package=bio_pieces
- nosetests tests --with-coverage --cover-erase --cover-package=bio_bits -a '!download'
- pybot tests/*.robot
after_success:
- coveralls

notifications:
email:
recipients:
- michael.panciera.work@gmail.com
- coveralls
9 changes: 9 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@
CHANGELOG
=========

Version 1.2.0
-------------

* Renamed project to bio_bits to fix naming issue with other project
* GPL License added
* degen_regions script added
* parallel_blast added
* plot_muts script added

Version 1.1.0
-------------

Expand Down
17 changes: 17 additions & 0 deletions COPYRIGHT
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
bio_bits is a collection of bioinformatics scripts and libraries written in Python
Copyright (C) 2015 Cherokee Nation Technology Solutions, LLC


This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
32 changes: 32 additions & 0 deletions Den4_MAAPS_TestData16.fasta

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions README.rst
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
bio_pieces
bio_bits
==========

.. image:: https://readthedocs.org/projects/bio-pieces/badge/?version=latest
:target: http://bio-pieces.readthedocs.org/en/latest/
:alt: Documentation Status

.. image:: https://travis-ci.org/VDBWRAIR/bio_pieces.svg
:target: https://travis-ci.org/VDBWRAIR/bio_pieces
.. image:: https://travis-ci.org/VDBWRAIR/bio_bits.svg
:target: https://travis-ci.org/VDBWRAIR/bio_bits

.. image:: https://coveralls.io/repos/VDBWRAIR/bio_pieces/badge.svg
:target: https://coveralls.io/r/VDBWRAIR/bio_pieces
.. image:: https://coveralls.io/repos/VDBWRAIR/bio_bits/badge.svg
:target: https://coveralls.io/r/VDBWRAIR/bio_bits

Various bioinformatics scripts

Expand Down
6 changes: 6 additions & 0 deletions bio_bits/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Package-level metadata for bio_bits.
__version__ = '1.2.0'
# The release string is kept identical to the version string.
__release__ = __version__
# Authors and their e-mail addresses are comma-separated, in matching order.
__authors__ = 'Tyghe Vallard, Michael Panciera'
__authoremails__ = 'vallardt@gmail.com, michael.panciera.work@gmail.com'
# NOTE(review): the description contains the typo "libraries to for inclusion";
# it is left untouched here because the string may be consumed at runtime.
__description__ = 'bio_bits is a collection of bioinformatics scripts and libraries written in Python. The goal of the project is to make common bioinformatics tasks easier to perform and to provide useful libraries to for inclusion in more complex bioinformatics libraries and scripts.'
__projectname__ = 'bio_bits'
File renamed without changes.
2 changes: 1 addition & 1 deletion bio_pieces/amos2fastq.py → bio_bits/amos2fastq.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from Bio import SeqIO
import itertools
import pandas as pd
from bio_pieces import amos
from bio_bits import amos
''' Python3 compatibility '''
from past.builtins import map , filter

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
'''
from schema import Schema, Use, And
from docopt import docopt
from bio_pieces import amos2fastq
from bio_bits import amos2fastq
#Do file validation immediately when script is started
def all_elemnts_unique(collection):
return len(collection) == len(set(collection))
Expand Down
File renamed without changes.
File renamed without changes.
36 changes: 36 additions & 0 deletions bio_bits/compat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
try:
from StringIO import StringIO
except ImportError:
from io import StringIO

# BytesIO lives in the ``io`` module on both Python 2.6+ and Python 3.
# The standard library never shipped a top-level ``BytesIO`` module, so the
# former ``from BytesIO import BytesIO`` attempt always fell through to this
# import anyway.
from io import BytesIO

from future.builtins import map, filter, zip

try:
import unittest2 as unittest
except ImportError:
import unittest


# Expose ``reduce`` under one name on every interpreter: Python 3 only has
# functools.reduce, while very old Python 2 releases only have the builtin.
# Catch ImportError specifically so unrelated errors (KeyboardInterrupt,
# SystemExit, ...) are never silently swallowed.
try:
    from functools import reduce
except ImportError:
    pass

try:
from collections import OrderedDict
except ImportError:
from ordereddict import OrderedDict

try:
from __builtin__ import open
except ImportError:
from builtins import open

# Directory that contains this module file.
# NOTE(review): the earlier comment called this the "Tests directory", but
# dirname(__file__) resolves to wherever this file lives - confirm the
# intended use with callers.
from os.path import dirname
THIS = dirname(__file__)
240 changes: 240 additions & 0 deletions bio_bits/ctleptop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
#!/usr/bin/env python
# encoding: utf-8
"""
ctleptop.py -i [FASTA FILE] > Out_file.txt
Created by Dereje Jima on May 21, 2015
"""
from __future__ import division
from __future__ import print_function
from Bio.Seq import *
from Bio.Alphabet import IUPAC
from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna
#from itertools import groupby
from Bio.Data import CodonTable
from Bio.Data.IUPACData import ambiguous_dna_values
#import yaml
import argparse
from bio_bits import degen
from functools import partial
from tabulate import tabulate
from bio_bits.compat import zip
import re
import sys

__docformat__ = "restructuredtext en"

# Maps each single-letter ambiguous nucleotide code to the list of concrete
# bases (A, C, G, T) it can stand for. Despite the name, the keys are single
# nucleotides, not whole codons. Unambiguous bases are intentionally absent.
AMBICODON = {"R": ["A", "G"], "Y": ["C", "T"],
"W": ["A", "T"], "S": ["G", "C"],
"K": ["T", "G"],
"M": ["C", "A"], "D": ["A", "T", "G"],
"V": ["A", "C", "G"], "H": ["A", "C", "T"],
"B": ["C", "G", "T"], "N": ["A", "C", "T", "G"]}

def getNearbyChars(nt):
    """(str) -> (list or str)
    Return the concrete bases an ambiguous nucleotide code can stand for.

    For a code found in AMBICODON the stored list is returned (the same list
    object, so callers must not mutate it). Any other value is handed back
    unchanged, which for a single unambiguous base means a one-character
    string rather than a list.

    >>> getNearbyChars("R")
    ['A', 'G']
    >>> getNearbyChars("Y")
    ['C', 'T']
    >>> getNearbyChars("A")
    'A'
    """
    expansions = AMBICODON.get(nt)
    if expansions:
        return expansions
    return nt

def nearbyPermutations(letters, index=0):
    """(str) -> (set)
    Return every unambiguous sequence that ``letters[index:]`` can represent.

    The sequence is walked from its last position back to ``index``; at each
    position the words collected so far are prefixed with every base that the
    letter at that position may stand for.

    >>> sorted(nearbyPermutations("AAR"))
    ['AAA', 'AAG']
    >>> sorted(nearbyPermutations("ARR"))
    ['AAA', 'AAG', 'AGA', 'AGG']
    >>> sorted(nearbyPermutations("AAA"))
    ['AAA']
    """
    # Nothing left to expand yields the single empty word.
    words = set([''])
    for position in range(len(letters) - 1, index - 1, -1):
        choices = getNearbyChars(letters[position])
        words = permutations(words, choices)
    return words

def permutations(subWords, nearbyLetters):
    """(set, list) -> (set)
    Prefix every word in ``subWords`` with every letter in ``nearbyLetters``.

    Returns the set of all ``letter + word`` combinations; the result is
    empty when either input is empty.

    >>> sorted(permutations(set(['CA']), ['A', 'T']))
    ['ACA', 'TCA']
    """
    return {
        prefix + suffix
        for suffix in subWords
        for prefix in nearbyLetters
    }

def getaalist(codonlist):
    """(list) -> (list)
    Translate each nucleotide codon in ``codonlist`` to its amino acid.

    Every codon is wrapped in a ``Seq`` with the unambiguous DNA alphabet,
    translated, and converted to ``str``; the results keep the input order.

    >>> getaalist(['AAA', 'ACT'])
    ['K', 'T']
    """
    return [
        str(translate(Seq(codon, IUPAC.unambiguous_dna)))
        for codon in codonlist
    ]

def list_overlap(list1, list2):
    """(str, list) -> bool
    Return True if at least one element of ``list1`` is also in ``list2``.

    ``list1`` is iterated (a string is walked character by character) and
    each element is looked up in ``list2`` with ``in``; the search stops at
    the first hit.

    >>> list_overlap('RAC', ['B', 'D', 'H', 'K', 'M', 'N', 'S', 'R', 'W', 'V', 'Y'])
    True
    >>> list_overlap('ACT', ['B', 'D', 'H', 'K', 'M', 'N', 'S', 'R', 'W', 'V', 'Y'])
    False
    """
    return any(element in list2 for element in list1)

def access_mixed_aa(file_name):
"""(str) ->(list,list,list,list).
Translate a nucleotide multi-fasta file codon by codon and report the
codons that contain ambiguous nucleotides.

Every record returned by ``SeqIO.parse(file_name, 'fasta')`` is processed;
'-' characters are replaced by 'N' before the sequence is cut into
three-character codons.

:param file_name: fasta input handed straight to ``SeqIO.parse``
:return: tuple ``(aa, nucleotide_idx, nucl_codon, seqids)``.
``aa`` collects the translations: a plain amino acid string for codons
without ambiguous bases, and the sorted, '/'-joined set of possible amino
acids (e.g. 'I/L') for ambiguous codons. ``nucleotide_idx``, ``nucl_codon``
and ``seqids`` get one entry (position, codon text, record id) each time an
ambiguous codon yields more than one amino acid. All four lists accumulate
across every record in the file.
"""
# Function-scope import of the fasta parser.
from Bio import SeqIO
aa = []
nucleotide_idx = []
nucl_codon = []
seqids = []
for seq_record in SeqIO.parse(file_name, 'fasta'):
seq_id = seq_record.id
seqline = str(seq_record.seq)
# '-' is rewritten to 'N', which is a key of AMBICODON, so these
# positions are handled by the ambiguous-codon branch below.
seqline = seqline.replace("-", "N")
n = 3
# Keys are i + n, i.e. the 1-based position of each codon's last base;
# values are the codon text (the final chunk may be shorter than 3).
codon_list = dict( (i + n , seqline[i:i + n]) for i in range(0, len(seqline), n))
ambi_nucl = AMBICODON.keys()
# Sorting the items walks the codons in sequence order.
for key, codon in sorted(codon_list.items()):
if list_overlap(codon, ambi_nucl):
# NOTE(review): this unpacking needs exactly three characters, so a
# trailing partial codon that contains an ambiguous base would raise
# ValueError here.
d, e, f = codon
m = [d, e, f]
# Ambiguous bases of this codon, in codon order.
items = [i for i in m if i in ambi_nucl]
# Offset (0-2) of the first ambiguous base within the codon.
indexm = m.index(items[0])
# NOTE(review): idx is never read and val is reassigned immediately, so
# the statements below do not depend on which item is being visited.
# TODO confirm which of them are meant to run once per ambiguous base.
for idx, val in enumerate(items):
# Expand the codon into every unambiguous codon it can represent.
codonlist = list(nearbyPermutations(codon))
val = getaalist(codonlist)
# remove if aa codon is the same eg. ['D', 'D']
val = set(val)
val = "/".join(sorted(val)) # yields e.g. 'I/L'

# key starts as (codon start offset + 3), so key - 2 + indexm is the
# 1-based position of the first ambiguous base in the codon.
# NOTE(review): if this runs more than once for the same codon, key
# would be shifted again - TODO confirm.
key = key - 2 + indexm
# Only codons whose alternatives give different amino acids are reported.
if '/' in val:
nucleotide_idx.append(key)
nucl_codon.append(codon)
seqids.append(seq_id)
# if "/" in val and indexm == 2:
# key = key
# nucleotide_idx.append(key)
# nucl_codon.append(codon)
# seqids.append(seq_id)
# elif "/" in val and indexm == 1:
# key = key - 1
# nucleotide_idx.append(key)
# nucl_codon.append(codon)
# seqids.append(seq_id)
# elif "/" in val and indexm == 0:
# key = key - 2
# nucleotide_idx.append(key)
# nucl_codon.append(codon)
# seqids.append(seq_id)
# else:
# pass
aa.append(val)

else:
# print "codon3 ..." ,codon
# No ambiguous base: translate the codon directly.
aa1 = Seq(codon, IUPAC.unambiguous_dna)
aa1 = aa1.translate()
aa1 = str(aa1)
aa.append(aa1)
#print aa, nucleotide_idx, nucl_codon, seqids
return aa, nucleotide_idx, nucl_codon, seqids


def create_args(argv=None):
    """
    Build the command line parser and return the parsed arguments.

    ``-i`` (input fasta) and ``-o`` (output file) are required, and exactly
    one of ``--gb-file``, ``--gb-id`` or ``--tab-file`` must be supplied.
    ``--cds`` is optional. argparse exits the process with an error message
    when these constraints are violated.

    :param argv: optional list of argument strings to parse instead of the
        process command line; the default ``None`` makes argparse read
        ``sys.argv[1:]``, which is what the zero-argument call always did.
    :return: ``argparse.Namespace`` with the attributes ``i``, ``o``,
        ``gb_file``, ``gb_id``, ``tab_file`` and ``cds``
    """
    parser = argparse.ArgumentParser(
        description=(
            'Convert inframe nucleotide fasta file to protein and report '
            'mixed (ambiguous codon) with its location in the sequence'
        ),
        epilog='%(prog)s -i tests/Den4_MAAPS_TestData16.fasta -o out_file.txt'
    )
    # Exactly one source of gene information has to be chosen.
    g = parser.add_mutually_exclusive_group(required=True)
    parser.add_argument("-i", type=str, help="Nucleotide fasta file", required=True)
    parser.add_argument("-o", type=str, help="output file name", required=True)
    g.add_argument("--gb-file", type=str, help="genbank file name")
    g.add_argument("--gb-id", type=str, help="genbank accession id")
    g.add_argument("--tab-file", type=str, help="gene tab/csv file")
    parser.add_argument('--cds', type=str, help="CDS start stop[start,stop]")
    return parser.parse_args(argv)

def mod_entry(entry, cds):
    '''
    Flag a mixed-position entry that is non-coding or contains a gap.

    The amino acid field (index 4) is replaced by 'NON-CODING' when the
    nucleotide position (index 1) is at or before ``cds.start`` or at or
    after ``cds.end``. Otherwise it is replaced by 'GAPFOUND' when the
    nucleotide codon (index 3) contains an 'N'. In every other case the
    entry is returned unchanged.

    :param entry: indexable (seqid,nucindex,aaindex,nuclcodon,aacodon,genename)
    :param cds: object exposing ``start`` and ``end`` of the coding region
    :return: tuple copy of entry, modified to reflect gap or non-coding
    '''
    position, codon = entry[1], entry[3]

    label = None
    if cds.start >= position or cds.end <= position:
        label = 'NON-CODING'
    elif 'N' in codon:
        label = 'GAPFOUND'

    fields = list(entry)
    if label is not None:
        fields[4] = label
    return tuple(fields)

def main():
    """
    Command line entry point: write a table of mixed amino acid positions.

    Parses the command line, scans the input fasta for ambiguous codons,
    looks up the genes overlapping each reported position and writes a
    tabulated report to the output file. A CDS given with ``--cds``
    overrides the one returned by the gene lookup. If no CDS is available
    a message is printed and the process exits with status 1.
    """
    options = create_args()

    with open(options.o, 'w+') as report:
        residues, positions, codons, record_ids = access_mixed_aa(options.i)

        # Get Gene info
        reference_genes, cds = degen.get_genes(
            options.gb_id, options.gb_file, options.tab_file
        )
        gene_names = degen.get_degen_list_overlap(reference_genes, positions)

        # Keep only the mixed ('/'-joined) amino acid calls
        mixed_residues = [residue for residue in residues if '/' in residue]
        # Amino acid index for every reported nucleotide position
        residue_numbers = [position // 3 + 1 for position in positions]

        if options.cds:
            cds_start, cds_end = [int(bound) for bound in options.cds.split(',')]
            cds = degen.Gene('CDS', cds_start, cds_end)

        if cds is None:
            print("No CDS information supplied via input file or on command line")
            sys.exit(1)

        # Mark gaps and non-coding positions row by row
        rows = [
            mod_entry(row, cds)
            for row in zip(
                record_ids, positions, residue_numbers,
                codons, mixed_residues, gene_names
            )
        ]
        column_names = [
            'seq id', 'nt Position', 'aa position',
            'nt composition', 'aa composition', 'gene name'
        ]
        report.write(tabulate(rows, headers=column_names) + "\n")

if __name__ == '__main__':
main()

0 comments on commit 5316608

Please sign in to comment.