
test coverage 100% #13

Merged (37 commits) on Aug 21, 2020
Commits (37)
785d286  ref allele of "?" shouldn't validate (afrubin, Jul 20, 2020)
97eb044  use abstract base classes (afrubin, Jul 20, 2020)
f4defc4  fix # pragma statements (afrubin, Jul 21, 2020)
4d53e7e  test additional exceptions (afrubin, Jul 21, 2020)
8826794  remove deleted test cases (afrubin, Jul 21, 2020)
964343b  disable logging output for tests (afrubin, Jul 21, 2020)
1fcecf8  couldn't figure out how to raise this in tests (afrubin, Jul 21, 2020)
bbeb3fc  add permission error mocks (afrubin, Jul 21, 2020)
a25e2ae  black formatting (afrubin, Jul 22, 2020)
c5a44c7  use base class temp directory (afrubin, Jul 22, 2020)
fb145f9  fix up imports (afrubin, Jul 22, 2020)
437c2c5  use fqfa sequence validation (afrubin, Jul 22, 2020)
b1ceb83  parsers don't need to know variant format (afrubin, Jul 22, 2020)
d34c15c  check for support of noncoding datasets (afrubin, Jul 22, 2020)
fd743cc  remove unused import (afrubin, Jul 22, 2020)
7760374  always drop na inplace (afrubin, Jul 22, 2020)
a589a88  remove self.bin from tests (afrubin, Jul 22, 2020)
5f31f97  black formatting (afrubin, Jul 22, 2020)
cc87948  move tests out of module (afrubin, Jul 22, 2020)
ce1324d  use default temp location (afrubin, Jul 22, 2020)
4b0fb6b  drop removed modules (afrubin, Jul 27, 2020)
e275230  unused import (afrubin, Jul 27, 2020)
a7c6d5b  wrote generic hgvs uniqueness check (afrubin, Aug 19, 2020)
a662a60  remove tests for zero-based protein variants (afrubin, Aug 21, 2020)
ba43e9e  add coverage config file (afrubin, Aug 21, 2020)
5affb40  update tests for hgvs uniqueness (afrubin, Aug 21, 2020)
bcfb10a  fix typo (afrubin, Aug 21, 2020)
72f6911  remove unnecessary check for special variants (afrubin, Aug 21, 2020)
d3edc9e  unknown wild-type amino acid shouldn't validate (afrubin, Aug 21, 2020)
c61f297  more test cleanup (afrubin, Aug 21, 2020)
18404b1  linting and dead code removal (afrubin, Aug 21, 2020)
c1a37f6  test using convert() method (afrubin, Aug 21, 2020)
5bbeea7  full test coverage for enrich and empiric (afrubin, Aug 21, 2020)
73ae1fe  add tests and fix edge case with special variants (afrubin, Aug 21, 2020)
ce85292  split huge parsing test into a new file (afrubin, Aug 21, 2020)
f891746  full test coverage but enrich2 tsv needs work (afrubin, Aug 21, 2020)
b0345dc  Merge branch 'main' into test-coverage (afrubin, Aug 21, 2020)
4 changes: 4 additions & 0 deletions .coveragerc
@@ -0,0 +1,4 @@
[run]
branch = True
omit =
mavedbconvert/main.py
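
The new .coveragerc turns on branch coverage and omits the CLI entry point (mavedbconvert/main.py) from measurement. As a rough illustration of how such a config is consumed (not part of this PR), the coverage.py API picks it up directly; the imported module below is only a placeholder for whatever code the tests exercise.

import coverage

# Minimal sketch, assuming coverage.py is installed and this runs from the
# repository root where .coveragerc lives.
cov = coverage.Coverage(config_file=".coveragerc")  # applies branch = True and the omit list
cov.start()

import mavedbconvert.utilities  # placeholder: anything imported or executed here is measured

cov.stop()
cov.save()
cov.report(show_missing=True)  # mavedbconvert/main.py is excluded per the omit setting
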
4 changes: 0 additions & 4 deletions mavedbconvert/__init__.py
@@ -1,21 +1,17 @@
import os
import sys
import tempfile
import logging.config

__all__ = [
"tests",
"base",
"constants",
"empiric",
"enrich",
"enrich2",
"exceptions",
"fasta",
"utilities",
"filters",
"validators",
"disable_logging",
"LOGGER",
]

23 changes: 5 additions & 18 deletions mavedbconvert/base.py
@@ -165,10 +165,6 @@ def input_is_tsv(self):
def input_is_scores_based(self):
return self.input_type == constants.score_type

@property
def input_is_counts_based(self):
return self.input_type == constants.count_type

@property
def output_directory(self):
return os.path.normpath(os.path.expanduser(self.dst))
@@ -278,36 +274,27 @@ def validate_against_protein_sequence(self, variant):
return

variant = utilities.ProteinSubstitutionEvent(variant)
zero_based_pos = variant.position - int(self.one_based)
if zero_based_pos < 0:
raise IndexError(
(
"Encountered a negative position in {} with one_based "
"set as {}. Positions might not be one-based."
).format(variant, self.one_based)
)

if zero_based_pos >= len(self.protein_sequence):
if variant.position > len(self.protein_sequence):
raise IndexError(
"Position {} (index {}) in {} "
"Position {} in {} "
"extends beyond the maximum index {} in the translated "
"wild-type sequence {} with length {}.".format(
zero_based_pos + int(self.one_based),
zero_based_pos,
variant.position,
variant,
len(self.protein_sequence) - 1,
self.protein_sequence,
len(self.protein_sequence),
)
)

wt_aa = AA_CODES[self.protein_sequence[zero_based_pos]]
wt_aa = AA_CODES[self.protein_sequence[variant.position - 1]]
if variant.ref != wt_aa:
raise ValueError(
"Reference AA '{aa}' at 1-based position {pos} in the "
"translated protein sequence {seq} does not match the "
"reference AA '{ref}' suggested in variant '{variant}'.".format(
pos=zero_based_pos + 1,
pos=variant.position,
aa=wt_aa,
variant=variant,
ref=variant.ref,
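
The rewritten validate_against_protein_sequence drops the zero-based conversion and works with one-based positions throughout, indexing the translated sequence with position - 1. A simplified standalone sketch of that check (not the actual class method):

from fqfa.constants.iupac.protein import AA_CODES

def check_reference_aa(protein_sequence, position, ref):
    # Simplified sketch of the one-based reference check introduced in this diff.
    if position > len(protein_sequence):
        raise IndexError(
            "Position {} extends beyond the translated wild-type sequence "
            "with length {}.".format(position, len(protein_sequence))
        )
    wt_aa = AA_CODES[protein_sequence[position - 1]]  # one-based position, zero-based index
    if ref != wt_aa:
        raise ValueError(
            "Reference AA '{}' at position {} does not match the reference "
            "AA '{}' suggested in the variant.".format(wt_aa, position, ref)
        )
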
2 changes: 2 additions & 0 deletions mavedbconvert/constants.py
@@ -1,5 +1,7 @@
import re

MAX_ERROR_VARIANTS = 5

supported_programs = ("enrich", "enrich2", "empiric")
extra_na = (
"None",
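
MAX_ERROR_VARIANTS is new here and its use is not shown in this diff; a constant like this usually caps how many offending values an error message lists. The helper below is purely hypothetical and only illustrates that pattern:

from mavedbconvert import constants

def summarize_invalid(variants):
    # Hypothetical helper (not in the PR): show at most MAX_ERROR_VARIANTS
    # offending variants so error messages stay short for large files.
    shown = ", ".join(str(v) for v in variants[: constants.MAX_ERROR_VARIANTS])
    if len(variants) > constants.MAX_ERROR_VARIANTS:
        shown += ", ..."
    return shown
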
32 changes: 18 additions & 14 deletions mavedbconvert/empiric.py
@@ -4,6 +4,7 @@
import numpy as np
from fqfa.constants.translation.table import CODON_TABLE
from fqfa.constants.iupac.protein import AA_CODES
from xlrd.biffh import XLRDError

from . import base, utilities, constants, filters, validators, LOGGER

@@ -75,12 +76,9 @@ def infer_pro_substitution(wt_aa, mut_aa, codon_pos):
The HGVS-formatted subsitution event.
"""

# Normalize ? to X and ??? to Xaa
if wt_aa in ("?", "???"):
wt_aa = "Xaa"
else:
wt_aa = AA_CODES[wt_aa.upper()]
wt_aa = AA_CODES[wt_aa.upper()]

# Normalize ? to X and ??? to Xaa
if mut_aa in ("?", "???"):
mut_aa = "Xaa"
else:
@@ -165,14 +163,18 @@ def load_input_file(self):
logger.info("Skipping last {} row(s).".format(self.skip_footer_rows + 1))

if self.extension in (".xlsx", ".xls"):
od = pd.read_excel(
self.src,
na_values=constants.extra_na,
skiprows=self.skip_header_rows,
skipfooter=self.skip_footer_rows,
sheet_name=self.sheet_name,
)
if not self.sheet_name:
try:
od = pd.read_excel(
self.src,
na_values=constants.extra_na,
sheet_name=self.sheet_name,
skiprows=self.skip_header_rows,
skipfooter=self.skip_footer_rows,
)
except XLRDError:
raise ValueError(f"invalid Excel sheet name '{self.sheet_name}'")

if self.sheet_name is None:
self.sheet_name = list(od.keys())[0]
if len(od) > 1:
logger.warning(
@@ -182,7 +184,9 @@ def load_input_file(self):
", ".join(list(od.keys())), self.sheet_name
)
)
df = od[self.sheet_name]
df = od[self.sheet_name]
else:
df = od
else:
sep = "\t"
if self.ext.lower() == ".csv":
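
Both the EMPIRIC and Enrich loaders now wrap pd.read_excel so that a bad sheet name surfaces as a ValueError rather than a raw XLRDError, and only index into the returned dict when no sheet was requested (pandas returns a dict of DataFrames for sheet_name=None and a single DataFrame otherwise). A condensed standalone sketch of that flow; the real methods also pass na_values, skiprows, and skipfooter, and warn when a workbook has several sheets:

import pandas as pd
from xlrd.biffh import XLRDError

def read_scores_excel(src, sheet_name=None):
    # Condensed sketch of the shared Excel-loading logic in this PR.
    try:
        result = pd.read_excel(src, sheet_name=sheet_name)
    except XLRDError:
        raise ValueError(f"invalid Excel sheet name '{sheet_name}'")

    if sheet_name is None:
        # sheet_name=None yields a dict keyed by sheet name; fall back to the first sheet.
        first_sheet = list(result.keys())[0]
        return result[first_sheet]
    return result  # a specific sheet was requested, so this is already a DataFrame
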
50 changes: 25 additions & 25 deletions mavedbconvert/enrich.py
@@ -4,7 +4,7 @@
import pandas as pd
import numpy as np
from fqfa.constants.iupac.protein import AA_CODES

from xlrd.biffh import XLRDError

from . import LOGGER, constants, base, utilities, filters, validators

@@ -53,12 +53,6 @@ def __init__(
if not is_coding:
raise ValueError("Enrich does not support non-coding datasets.")

if not self.score_column and self.input_type == constants.score_type:
raise ValueError(
"A score column must be specified if "
"the input file is a scores file."
)

def load_input_file(self):
"""
Loads the input file specified at initialization into a dataframe.
Expand All @@ -73,14 +67,18 @@ def load_input_file(self):
logger.info("Skipping last {} row(s).".format(self.skip_footer_rows + 1))

if self.extension in (".xlsx", ".xls"):
od = pd.read_excel(
self.src,
na_values=constants.extra_na,
sheet_name=self.sheet_name,
skiprows=self.skip_header_rows,
skipfooter=self.skip_footer_rows,
)
if not self.sheet_name:
try:
od = pd.read_excel(
self.src,
na_values=constants.extra_na,
sheet_name=self.sheet_name,
skiprows=self.skip_header_rows,
skipfooter=self.skip_footer_rows,
)
except XLRDError:
raise ValueError(f"invalid Excel sheet name '{self.sheet_name}'")

if self.sheet_name is None:
self.sheet_name = list(od.keys())[0]
if len(od) > 1:
logger.warning(
@@ -90,7 +88,9 @@ def load_input_file(self):
", ".join(list(od.keys())), self.sheet_name
)
)
df = od[self.sheet_name]
df = od[self.sheet_name]
else:
df = od
else:
sep = "\t"
if self.ext.lower() == ".csv":
@@ -129,6 +129,8 @@ def parse_row(self, row):
raise ValueError("'{}' is a malformed SeqID.".format(seq_id))

positions, aa_codes = seq_id.split("-")
if len(positions) == 0 or len(aa_codes) == 0:
raise ValueError("'{}' is a malformed SeqID.".format(seq_id))
positions = positions.split(",")
aa_codes = aa_codes.split(",")
events = []
@@ -179,18 +181,17 @@ def parse_row(self, row):
if aa == "?":
mut_aa = "???"
else:
mut_aa = AA_CODES[aa.upper()]
try:
mut_aa = AA_CODES[aa.upper()]
except KeyError as e:
raise KeyError(f"Invalid amino acid {e} in '{seq_id}'")
if wt_aa == mut_aa:
events.append("{wt}{pos}=".format(wt=wt_aa, pos=aa_position))
else:
events.append(
"{wt}{pos}{mut}".format(wt=wt_aa, pos=aa_position, mut=mut_aa)
)

if len(events) == 0:
raise ValueError(
"Could not parse any variant strings from {}".format(seq_id)
)
return utilities.hgvs_pro_from_event_list(events)

def parse_input(self, df):
@@ -254,10 +255,9 @@ def parse_input(self, df):
data[column] = list(utilities.format_column(column_values, astype))

# Sort column order so 'score' comes right after hgvs columns.
if self.input_is_scores_based:
mave_columns = (
mave_columns[:2] + [constants.mavedb_score_column] + mave_columns[2:]
)
mave_columns = (
mave_columns[:2] + [constants.mavedb_score_column] + mave_columns[2:]
)
mavedb_df = pd.DataFrame(data=data, columns=mave_columns)
filters.drop_na_rows(mavedb_df)
filters.drop_na_columns(mavedb_df)
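
The parse_row changes tighten SeqID handling: an empty positions or codes half of the "positions-codes" string is now rejected, and an unrecognised amino-acid letter is re-raised with the offending SeqID attached. A rough standalone sketch of those two guards (not the full parser, which also builds the HGVS events):

from fqfa.constants.iupac.protein import AA_CODES

def split_seq_id(seq_id):
    # Sketch of the new malformed-SeqID guard: both halves must be non-empty.
    positions, aa_codes = seq_id.split("-")
    if len(positions) == 0 or len(aa_codes) == 0:
        raise ValueError("'{}' is a malformed SeqID.".format(seq_id))
    return positions.split(","), aa_codes.split(",")

def mutant_aa_code(aa, seq_id):
    # Sketch of the new amino-acid lookup: '?' stays unknown, anything else
    # must be a valid single-letter code or the SeqID is reported.
    if aa == "?":
        return "???"
    try:
        return AA_CODES[aa.upper()]
    except KeyError as e:
        raise KeyError(f"Invalid amino acid {e} in '{seq_id}'")
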