Skip to content

Commit

Permalink
test coverage 100% (#13)
Browse files Browse the repository at this point in the history
* ref allele of "?" shouldn't validate

* use abstract base classes

* fix # pragma statements

* test additional exceptions

* remove deleted test cases

* disable logging output for tests

* couldn't figure out how to raise this in tests

* add permission error mocks

* black formatting

* use base class temp directory

* fix up imports

* use fqfa sequence validation

* parsers don't need to know variant format

* check for support of noncoding datasets

* remove unused import

* always drop na inplace

* remove self.bin from tests

* black formatting

* move tests out of module

* use default temp location

* drop removed modules

* unused import

* wrote generic hgvs uniqueness check

* remove tests for zero-based protein variants

These variants were not being generated by any module code, since utilities.ProteinSubstitutionEvent generates one-based positions.

* add coverage config file

* update tests for hgvs uniqueness

* fix typo

* remove unnecessary check for special variants

* unknown wild-type amino acid shouldn't validate

* more test cleanup

* linting and dead code removal

* test using convert() method

* full test coverage for enrich and empiric

* add tests and fix edge case with special variants

* split huge parsing test into a new file

* full test coverage but enrich2 tsv needs work
  • Loading branch information
afrubin committed Aug 21, 2020
1 parent 17f31c3 commit e0eb763
Show file tree
Hide file tree
Showing 21 changed files with 1,047 additions and 822 deletions.
4 changes: 4 additions & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[run]
branch = True
omit =
mavedbconvert/main.py
4 changes: 0 additions & 4 deletions mavedbconvert/__init__.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,17 @@
import os
import sys
import tempfile
import logging.config

__all__ = [
"tests",
"base",
"constants",
"empiric",
"enrich",
"enrich2",
"exceptions",
"fasta",
"utilities",
"filters",
"validators",
"disable_logging",
"LOGGER",
]

Expand Down
23 changes: 5 additions & 18 deletions mavedbconvert/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,10 +165,6 @@ def input_is_tsv(self):
def input_is_scores_based(self):
return self.input_type == constants.score_type

@property
def input_is_counts_based(self):
return self.input_type == constants.count_type

@property
def output_directory(self):
return os.path.normpath(os.path.expanduser(self.dst))
Expand Down Expand Up @@ -278,36 +274,27 @@ def validate_against_protein_sequence(self, variant):
return

variant = utilities.ProteinSubstitutionEvent(variant)
zero_based_pos = variant.position - int(self.one_based)
if zero_based_pos < 0:
raise IndexError(
(
"Encountered a negative position in {} with one_based "
"set as {}. Positions might not be one-based."
).format(variant, self.one_based)
)

if zero_based_pos >= len(self.protein_sequence):
if variant.position > len(self.protein_sequence):
raise IndexError(
"Position {} (index {}) in {} "
"Position {} in {} "
"extends beyond the maximum index {} in the translated "
"wild-type sequence {} with length {}.".format(
zero_based_pos + int(self.one_based),
zero_based_pos,
variant.position,
variant,
len(self.protein_sequence) - 1,
self.protein_sequence,
len(self.protein_sequence),
)
)

wt_aa = AA_CODES[self.protein_sequence[zero_based_pos]]
wt_aa = AA_CODES[self.protein_sequence[variant.position - 1]]
if variant.ref != wt_aa:
raise ValueError(
"Reference AA '{aa}' at 1-based position {pos} in the "
"translated protein sequence {seq} does not match the "
"reference AA '{ref}' suggested in variant '{variant}'.".format(
pos=zero_based_pos + 1,
pos=variant.position,
aa=wt_aa,
variant=variant,
ref=variant.ref,
Expand Down
2 changes: 2 additions & 0 deletions mavedbconvert/constants.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import re

MAX_ERROR_VARIANTS = 5

supported_programs = ("enrich", "enrich2", "empiric")
extra_na = (
"None",
Expand Down
32 changes: 18 additions & 14 deletions mavedbconvert/empiric.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np
from fqfa.constants.translation.table import CODON_TABLE
from fqfa.constants.iupac.protein import AA_CODES
from xlrd.biffh import XLRDError

from . import base, utilities, constants, filters, validators, LOGGER

Expand Down Expand Up @@ -75,12 +76,9 @@ def infer_pro_substitution(wt_aa, mut_aa, codon_pos):
The HGVS-formatted substitution event.
"""

# Normalize both ? and ??? to Xaa
if wt_aa in ("?", "???"):
wt_aa = "Xaa"
else:
wt_aa = AA_CODES[wt_aa.upper()]
wt_aa = AA_CODES[wt_aa.upper()]

# Normalize both ? and ??? to Xaa
if mut_aa in ("?", "???"):
mut_aa = "Xaa"
else:
Expand Down Expand Up @@ -165,14 +163,18 @@ def load_input_file(self):
logger.info("Skipping last {} row(s).".format(self.skip_footer_rows + 1))

if self.extension in (".xlsx", ".xls"):
od = pd.read_excel(
self.src,
na_values=constants.extra_na,
skiprows=self.skip_header_rows,
skipfooter=self.skip_footer_rows,
sheet_name=self.sheet_name,
)
if not self.sheet_name:
try:
od = pd.read_excel(
self.src,
na_values=constants.extra_na,
sheet_name=self.sheet_name,
skiprows=self.skip_header_rows,
skipfooter=self.skip_footer_rows,
)
except XLRDError:
raise ValueError(f"invalid Excel sheet name '{self.sheet_name}'")

if self.sheet_name is None:
self.sheet_name = list(od.keys())[0]
if len(od) > 1:
logger.warning(
Expand All @@ -182,7 +184,9 @@ def load_input_file(self):
", ".join(list(od.keys())), self.sheet_name
)
)
df = od[self.sheet_name]
df = od[self.sheet_name]
else:
df = od
else:
sep = "\t"
if self.ext.lower() == ".csv":
Expand Down
50 changes: 25 additions & 25 deletions mavedbconvert/enrich.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pandas as pd
import numpy as np
from fqfa.constants.iupac.protein import AA_CODES

from xlrd.biffh import XLRDError

from . import LOGGER, constants, base, utilities, filters, validators

Expand Down Expand Up @@ -53,12 +53,6 @@ def __init__(
if not is_coding:
raise ValueError("Enrich does not support non-coding datasets.")

if not self.score_column and self.input_type == constants.score_type:
raise ValueError(
"A score column must be specified if "
"the input file is a scores file."
)

def load_input_file(self):
"""
Loads the input file specified at initialization into a dataframe.
Expand All @@ -73,14 +67,18 @@ def load_input_file(self):
logger.info("Skipping last {} row(s).".format(self.skip_footer_rows + 1))

if self.extension in (".xlsx", ".xls"):
od = pd.read_excel(
self.src,
na_values=constants.extra_na,
sheet_name=self.sheet_name,
skiprows=self.skip_header_rows,
skipfooter=self.skip_footer_rows,
)
if not self.sheet_name:
try:
od = pd.read_excel(
self.src,
na_values=constants.extra_na,
sheet_name=self.sheet_name,
skiprows=self.skip_header_rows,
skipfooter=self.skip_footer_rows,
)
except XLRDError:
raise ValueError(f"invalid Excel sheet name '{self.sheet_name}'")

if self.sheet_name is None:
self.sheet_name = list(od.keys())[0]
if len(od) > 1:
logger.warning(
Expand All @@ -90,7 +88,9 @@ def load_input_file(self):
", ".join(list(od.keys())), self.sheet_name
)
)
df = od[self.sheet_name]
df = od[self.sheet_name]
else:
df = od
else:
sep = "\t"
if self.ext.lower() == ".csv":
Expand Down Expand Up @@ -129,6 +129,8 @@ def parse_row(self, row):
raise ValueError("'{}' is a malformed SeqID.".format(seq_id))

positions, aa_codes = seq_id.split("-")
if len(positions) == 0 or len(aa_codes) == 0:
raise ValueError("'{}' is a malformed SeqID.".format(seq_id))
positions = positions.split(",")
aa_codes = aa_codes.split(",")
events = []
Expand Down Expand Up @@ -179,18 +181,17 @@ def parse_row(self, row):
if aa == "?":
mut_aa = "???"
else:
mut_aa = AA_CODES[aa.upper()]
try:
mut_aa = AA_CODES[aa.upper()]
except KeyError as e:
raise KeyError(f"Invalid amino acid {e} in '{seq_id}'")
if wt_aa == mut_aa:
events.append("{wt}{pos}=".format(wt=wt_aa, pos=aa_position))
else:
events.append(
"{wt}{pos}{mut}".format(wt=wt_aa, pos=aa_position, mut=mut_aa)
)

if len(events) == 0:
raise ValueError(
"Could not parse any variant strings from {}".format(seq_id)
)
return utilities.hgvs_pro_from_event_list(events)

def parse_input(self, df):
Expand Down Expand Up @@ -254,10 +255,9 @@ def parse_input(self, df):
data[column] = list(utilities.format_column(column_values, astype))

# Sort column order so 'score' comes right after hgvs columns.
if self.input_is_scores_based:
mave_columns = (
mave_columns[:2] + [constants.mavedb_score_column] + mave_columns[2:]
)
mave_columns = (
mave_columns[:2] + [constants.mavedb_score_column] + mave_columns[2:]
)
mavedb_df = pd.DataFrame(data=data, columns=mave_columns)
filters.drop_na_rows(mavedb_df)
filters.drop_na_columns(mavedb_df)
Expand Down
Loading

0 comments on commit e0eb763

Please sign in to comment.