From ea59e8a3b5b4d7a6011c49637135eccfed9771cc Mon Sep 17 00:00:00 2001 From: carlosp420 Date: Fri, 20 Mar 2015 15:30:59 +0200 Subject: [PATCH] refactor number of genes for taxa --- voseq/create_dataset/dataset.py | 78 ++++++++++++--------------------- 1 file changed, 28 insertions(+), 50 deletions(-) diff --git a/voseq/create_dataset/dataset.py b/voseq/create_dataset/dataset.py index a5222da6..ea04d374 100644 --- a/voseq/create_dataset/dataset.py +++ b/voseq/create_dataset/dataset.py @@ -38,7 +38,7 @@ def get_number_chars_from_partition_list(self, partitions): gene_code = item.strip().replace('[', '').replace(']', '') continue if self.file_format == 'TNT': - gene_code = "dummy" + str(i) + gene_code = 'dummy' + str(i) i += 1 continue if gene_code != '': @@ -54,55 +54,33 @@ def get_number_of_genes_for_taxa(self, partitions): number_of_genes_for_taxa = dict() vouchers_to_drop = set() - if self.file_format == 'NEXUS': - gene_code = '' - for item in partitions[0]: - if item.startswith('\n'): - gene_code = item.strip().replace('[', '').replace(']', '') - continue - if gene_code != '': - entry = re.sub('\s+', ' ', item) - voucher, sequence = entry.split(' ') - - if voucher not in number_of_genes_for_taxa: - number_of_genes_for_taxa[voucher] = 0 - - sequence = sequence.replace('?', '') - if sequence != '': - number_of_genes_for_taxa[voucher] += 1 - - if self.minimum_number_of_genes is None: - self.vouchers_to_drop = [] - else: - for voucher in number_of_genes_for_taxa: - if number_of_genes_for_taxa[voucher] < self.minimum_number_of_genes: - vouchers_to_drop.add(voucher) - self.vouchers_to_drop = vouchers_to_drop - - if self.file_format == 'TNT': - gene_code = '' - for item in partitions[0]: - if item.startswith('\n'): - gene_code = 'dummy' - continue - if gene_code != '': - entry = re.sub('\s+', ' ', item) - voucher, sequence = entry.split(' ') - - if voucher not in number_of_genes_for_taxa: - number_of_genes_for_taxa[voucher] = 0 - - sequence = sequence.replace('?', '') - if sequence != '': - number_of_genes_for_taxa[voucher] += 1 - - if self.minimum_number_of_genes is None: - self.vouchers_to_drop = [] - else: - for voucher in number_of_genes_for_taxa: - if number_of_genes_for_taxa[voucher] < self.minimum_number_of_genes: - vouchers_to_drop.add(voucher) - self.vouchers_to_drop = vouchers_to_drop + gene_code = '' + for item in partitions[0]: + if item.startswith('\n'): + if self.file_format == 'NEXUS': + gene_code = item.strip().replace('[', '').replace(']', '') + continue + if self.file_format == 'TNT': + gene_code = 'dummy' + continue + if gene_code != '': + entry = re.sub('\s+', ' ', item) + voucher, sequence = entry.split(' ') + + if voucher not in number_of_genes_for_taxa: + number_of_genes_for_taxa[voucher] = 0 + + sequence = sequence.replace('?', '') + if sequence != '': + number_of_genes_for_taxa[voucher] += 1 + + if self.minimum_number_of_genes is None: + self.vouchers_to_drop = [] + else: + for voucher in number_of_genes_for_taxa: + if number_of_genes_for_taxa[voucher] < self.minimum_number_of_genes: + vouchers_to_drop.add(voucher) + self.vouchers_to_drop = vouchers_to_drop def get_reading_frames(self): """