From c25a13713c0efa4928f39bd637c6f4e681cd1339 Mon Sep 17 00:00:00 2001 From: carlosp420 Date: Sat, 6 Jan 2018 22:17:00 -0500 Subject: [PATCH 1/2] support lineages for genbank fasta files --- README.rst | 14 ++++++++++++++ requirements/base.txt | 4 ++-- voseq/create_dataset/utils.py | 36 ++++++++++++++++++++++++++++++++--- 3 files changed, 49 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index 256e77a0..9e939710 100644 --- a/README.rst +++ b/README.rst @@ -582,3 +582,17 @@ and place them in the ``config.json`` configuration file of VoSeq: * Save and exit. Thus, every picture that you upload into your VoSeq installation will be uploaded into your Flickr account. + +Lineages +======== + +GenBank fasta files have the optional field Lineages. You can add lineages in the form: + +LINEAGES = { + # superfamily: lineage from domain Eukaryota to suborder Ditrysia + "Papilionoidea": "Eukaryota; Metazoa; Ecdysozoa; Arthropoda; Hexapoda; Insecta; Pterygota; Neoptera; Holometabola; Lepidoptera; Glossata; Ditrysia; ", + "Hesperioidea": "Eukaryota; Metazoa; Ecdysozoa; Arthropoda; Hexapoda; Insecta; Pterygota; Neoptera; Holometabola; Lepidoptera; Glossata; Ditrysia; ", + "Hedyloidea": "Eukaryota; Metazoa; Ecdysozoa; Arthropoda; Hexapoda; Insecta; Pterygota; Neoptera; Holometabola; Lepidoptera; Glossata; Ditrysia; ", +} + +by editing the file ``voseq/create_dataset/utils.py``. 
diff --git a/requirements/base.txt b/requirements/base.txt index 0c2fc044..358a397c 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -16,6 +16,6 @@ django-haystack==2.5.0 easy-thumbnails==2.2 flickrapi==2.0 -dataset-creator==0.3.17 -seqrecord-expanded==0.2.8 +dataset-creator==0.3.18 +seqrecord-expanded==0.2.9 degenerate-dna==0.0.9 diff --git a/voseq/create_dataset/utils.py b/voseq/create_dataset/utils.py index 0ecd2f4c..da5834c5 100644 --- a/voseq/create_dataset/utils.py +++ b/voseq/create_dataset/utils.py @@ -1,3 +1,5 @@ +import re + from seqrecord_expanded import SeqRecordExpanded from seqrecord_expanded.exceptions import MissingParameterError from seqrecord_expanded.exceptions import TranslationErrorMixedGappedSeq @@ -13,6 +15,13 @@ from public_interface.models import Sequences from public_interface.models import Vouchers +LINEAGES = { + # superfamily: lineage from domain Eukaryota to suborder Ditrysia + "Papilionoidea": "Eukaryota; Metazoa; Ecdysozoa; Arthropoda; Hexapoda; Insecta; Pterygota; Neoptera; Holometabola; Lepidoptera; Glossata; Ditrysia; ", # noqa + "Hesperioidea": "Eukaryota; Metazoa; Ecdysozoa; Arthropoda; Hexapoda; Insecta; Pterygota; Neoptera; Holometabola; Lepidoptera; Glossata; Ditrysia; ", # noqa + "Hedyloidea": "Eukaryota; Metazoa; Ecdysozoa; Arthropoda; Hexapoda; Insecta; Pterygota; Neoptera; Holometabola; Lepidoptera; Glossata; Ditrysia; ", # noqa +} + class CreateDataset(object): """Accepts form input to create a dataset in several formats. 
@@ -171,6 +180,7 @@ def build_seq_obj(self, code, gene_code, our_taxon_names, all_seqs): seq = self.create_seq_record(this_voucher_seqs) if code in our_taxon_names: + lineage = self.get_lineage(code) seq_record = SeqRecordExpanded( seq, voucher_code=code.replace(" ", "_"), @@ -178,22 +188,41 @@ def build_seq_obj(self, code, gene_code, our_taxon_names, all_seqs): gene_code=gene_code, reading_frame=self.gene_codes_metadata[gene_code]['reading_frame'], table=self.gene_codes_metadata[gene_code]['genetic_code'], + lineage=lineage, ) return seq_record else: return None + def get_lineage(self, code): + voucher = Vouchers.objects.get(code=code) + try: + lineage = LINEAGES[voucher.superfamily] + except KeyError: + lineage = "" + + additional_lineage = ";".join([ + voucher.family, voucher.subfamily, voucher.tribe, voucher.subtribe, + voucher.genus, voucher.species, voucher.subspecies, + ]) + lineage += re.sub(";+", "; ", additional_lineage) + return lineage.strip() + def extract_sequence_from_all_seqs_in_db(self, all_seqs, code, gene_code): try: voucher_sequences = all_seqs[code] except KeyError: - self.warnings += ['Could not find sequences for voucher {0} and gene_code {1}'.format(code, gene_code)] + self.warnings += [ + 'Could not find sequences for voucher {0} and gene_code {1}'.format( + code, gene_code)] return '?' try: this_voucher_seqs = voucher_sequences[gene_code] except KeyError: - self.warnings += ['Could not find sequences for voucher {0} and gene_code {1}'.format(code, gene_code)] + self.warnings += [ + 'Could not find sequences for voucher {0} and gene_code {1}'.format( + code, gene_code)] return '?' return this_voucher_seqs @@ -242,7 +271,8 @@ def get_gene_codes_metadata(self): """ :return: dictionary with genecode and base pair number. 
""" - queryset = Genes.objects.all().values('gene_code', 'length', 'reading_frame', 'genetic_code') + queryset = Genes.objects.all().values( + 'gene_code', 'length', 'reading_frame', 'genetic_code') gene_codes_metadata = dict() for i in queryset: gene_code = i['gene_code'] From dafbd11076696150927987eda06899dc63a485d4 Mon Sep 17 00:00:00 2001 From: carlosp420 Date: Sat, 6 Jan 2018 22:44:03 -0500 Subject: [PATCH 2/2] upgrade dataset creator --- requirements/base.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/base.txt b/requirements/base.txt index 358a397c..92a41300 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -16,6 +16,6 @@ django-haystack==2.5.0 easy-thumbnails==2.2 flickrapi==2.0 -dataset-creator==0.3.18 +dataset-creator==0.3.19 seqrecord-expanded==0.2.9 degenerate-dna==0.0.9