Skip to content

Commit

Permalink
code cleanup and test improvements (#11)
Browse files Browse the repository at this point in the history
* ref allele of "?" shouldn't validate

* use abstract base classes

* fix # pragma statements

* test additional exceptions

* remove deleted test cases

* disable logging output for tests

* couldn't figure out how to raise this in tests

* add permission error mocks

* black formatting

* use base class temp directory

* fix up imports

* use fqfa sequence validation

* parsers don't need to know variant format

* check for support of noncoding datasets

* remove unused import

* always drop na inplace

* remove self.bin from tests

* black formatting

* move tests out of module

* use default temp location
  • Loading branch information
afrubin committed Jul 22, 2020
1 parent 07bbeaa commit 17f31c3
Show file tree
Hide file tree
Showing 30 changed files with 213 additions and 237 deletions.
13 changes: 1 addition & 12 deletions mavedbconvert/__init__.py
Expand Up @@ -20,9 +20,8 @@
]

HOMEDIR = os.path.normpath(os.path.expanduser("~/.mavedb_convert/"))
tempfile.tempdir = HOMEDIR
if not os.path.isdir(HOMEDIR):
os.mkdir(HOMEDIR)
os.mkdir(HOMEDIR) # pragma: no cover

LOGGER = "mavedbconvert"

Expand Down Expand Up @@ -58,13 +57,3 @@
},
}
)


def disable_logging():
logging.disable(logging.INFO)
logging.disable(logging.WARN)
logging.disable(logging.WARNING)
logging.disable(logging.ERROR)
logging.disable(logging.CRITICAL)
logging.disable(logging.DEBUG)
logging.disable(logging.FATAL)
15 changes: 10 additions & 5 deletions mavedbconvert/base.py
Expand Up @@ -2,9 +2,11 @@
import re
import logging
import numpy as np
from abc import ABCMeta, abstractmethod

from hgvsp import is_multi
from fqfa.constants.iupac.protein import AA_CODES
from fqfa.validator.validator import dna_bases_validator

from . import LOGGER, utilities, constants

Expand All @@ -15,7 +17,7 @@
__all__ = ["BaseProgram"]


class BaseProgram(object):
class BaseProgram(metaclass=ABCMeta):
"""
Convert an input file to MaveDB_ compliant counts or scores files.
Expand Down Expand Up @@ -140,7 +142,7 @@ def wt_sequence(self):
def wt_sequence(self, seq):
seq = str(seq).upper()
# Initialize sequence information.
if not constants.dna_re.fullmatch(seq):
if dna_bases_validator(seq) is None:
raise ValueError("{} is not a valid DNA sequence.".format(seq))
if self.is_coding:
self.protein_sequence = utilities.translate_dna(seq, offset=0)
Expand Down Expand Up @@ -184,14 +186,17 @@ def convert(self):
logger.info("Writing to {}".format(self.output_file))
mave_df.to_csv(self.output_file, sep=",", index=None, na_rep=np.NaN)

@abstractmethod
def load_input_file(self):
raise NotImplementedError()
pass # pragma: no cover

@abstractmethod
def parse_input(self, df):
raise NotImplementedError()
pass # pragma: no cover

@abstractmethod
def parse_row(self, row):
raise NotImplementedError()
pass # pragma: no cover

def validate_against_wt_sequence(self, variant):
"""
Expand Down
1 change: 0 additions & 1 deletion mavedbconvert/constants.py
Expand Up @@ -17,7 +17,6 @@
)
null_value_re = re.compile(r"\s+|nan|na|none|undefined|n/a|null")
surrounding_brackets_re = re.compile(r"\((.*)\)")
dna_re = re.compile(r"[ATCGatcg]+", flags=re.IGNORECASE)

# HGVSP constants
hgvsp_nt_pos = "position"
Expand Down
7 changes: 5 additions & 2 deletions mavedbconvert/empiric.py
Expand Up @@ -139,6 +139,9 @@ def __init__(
if not abs(offset) % 3 == 0:
raise ValueError("EMPIRIC offset must be a multiple of 3.")

if not is_coding:
raise ValueError("EMPIRIC does not support non-coding datasets.")

self.codon_column = None
self.aa_column = None
self.position_column = None
Expand Down Expand Up @@ -334,8 +337,8 @@ def parse_input(self, df):
mave_columns[:2] + [constants.mavedb_score_column] + mave_columns[2:]
)
mavedb_df = pd.DataFrame(data=data, columns=mave_columns)
filters.drop_na_rows(mavedb_df, inplace=True)
filters.drop_na_columns(mavedb_df, inplace=True)
filters.drop_na_rows(mavedb_df)
filters.drop_na_columns(mavedb_df)

logger.info("Running MaveDB compliance validation.")
validators.validate_mavedb_compliance(mavedb_df, df_type=self.input_type)
Expand Down
7 changes: 5 additions & 2 deletions mavedbconvert/enrich.py
Expand Up @@ -50,6 +50,9 @@ def __init__(
if not abs(offset) % 3 == 0:
raise ValueError("Enrich offset must be a multiple of 3.")

if not is_coding:
raise ValueError("Enrich does not support non-coding datasets.")

if not self.score_column and self.input_type == constants.score_type:
raise ValueError(
"A score column must be specified if "
Expand Down Expand Up @@ -256,8 +259,8 @@ def parse_input(self, df):
mave_columns[:2] + [constants.mavedb_score_column] + mave_columns[2:]
)
mavedb_df = pd.DataFrame(data=data, columns=mave_columns)
filters.drop_na_rows(mavedb_df, inplace=True)
filters.drop_na_columns(mavedb_df, inplace=True)
filters.drop_na_rows(mavedb_df)
filters.drop_na_columns(mavedb_df)

logger.info("Running MaveDB compliance validation.")
validators.validate_mavedb_compliance(mavedb_df, df_type=self.input_type)
Expand Down
8 changes: 4 additions & 4 deletions mavedbconvert/enrich2.py
Expand Up @@ -129,8 +129,8 @@ def drop_null(scores_df, counts_df=None):
sort=False,
)
assert len(joint_df) == len(counts_df)
filters.drop_na_columns(joint_df, inplace=True)
filters.drop_na_rows(joint_df, inplace=True)
filters.drop_na_columns(joint_df)
filters.drop_na_rows(joint_df)

score_columns = list(utilities.hgvs_columns(joint_df.columns)) + list(
utilities.non_hgvs_columns(scores_df.columns)
Expand All @@ -146,8 +146,8 @@ def drop_null(scores_df, counts_df=None):

assert_index_equal(scores_df.index, counts_df.index)
else:
filters.drop_na_columns(scores_df, inplace=True)
filters.drop_na_rows(scores_df, inplace=True)
filters.drop_na_columns(scores_df)
filters.drop_na_rows(scores_df)

return scores_df, counts_df

Expand Down
10 changes: 2 additions & 8 deletions mavedbconvert/filters.py
Expand Up @@ -8,13 +8,10 @@
logger = logging.getLogger(LOGGER)


def drop_na_columns(df, inplace=False):
def drop_na_columns(df):
"""
Drop columns where all entries are null. Operation is performed in place.
"""
if not inplace:
df = utilities.copy_dataframe(df)

has_nt_col = constants.nt_variant_col in df.columns
has_pro_col = constants.pro_variant_col in df.columns

Expand Down Expand Up @@ -46,14 +43,11 @@ def drop_na_columns(df, inplace=False):
return df


def drop_na_rows(df, inplace=False):
def drop_na_rows(df):
"""
Drop rows where all non-HGVS entries are null. Operation is performed in
place.
"""
if not inplace:
df = utilities.copy_dataframe(df)

null_rows = df.loc[:, utilities.non_hgvs_columns(df.columns)].isnull().all(axis=1)
if sum(null_rows) > 0:
logger.warning(
Expand Down
50 changes: 15 additions & 35 deletions mavedbconvert/parsers.py
@@ -1,6 +1,7 @@
import os
import logging
from fqfa.fasta.fasta import parse_fasta_records
from fqfa.validator.validator import dna_bases_validator

from . import LOGGER, constants, exceptions

Expand Down Expand Up @@ -59,12 +60,6 @@ def parse_dst(dst):
if not os.path.isdir(path):
os.makedirs(path, exist_ok=True)
os.access(path, mode=os.W_OK)
except FileNotFoundError as e:
logger.error(
"Could not create directory {}. "
"Please ensure it is a valid path.".format(path)
)
raise e
except PermissionError as e:
logger.error("Permission denied when creating {}.".format(path))
raise e
Expand All @@ -87,28 +82,21 @@ def parse_program(program):
return program


def parse_wt_sequence(wtseq, coding=True):
    """
    Parse and validate a wild-type sequence argument.

    Parameters
    ----------
    wtseq : str
        Either a raw DNA sequence or a path to a FASTA file containing it.
        If the path exists, the first record of the FASTA file is used.
    coding : bool
        When True, the sequence length must be a multiple of three.

    Returns
    -------
    str
        The upper-cased wild-type DNA sequence.

    Raises
    ------
    exceptions.InvalidWildTypeSequence
        If the sequence contains characters other than DNA bases.
    exceptions.SequenceFrameError
        If *coding* is True and the length is not a multiple of three.
    """
    if os.path.isfile(os.path.normpath(os.path.expanduser(wtseq))):
        with open(os.path.normpath(os.path.expanduser(wtseq))) as fh:
            # Only the first FASTA record is used.
            _, wtseq = next(parse_fasta_records(fh))

    # dna_bases_validator returns None (falsy) for invalid sequences.
    if not dna_bases_validator(wtseq.upper()):
        raise exceptions.InvalidWildTypeSequence(
            "Wild-type sequence contains invalid characters."
        )

    if coding and len(wtseq) % 3 != 0:
        # BUG FIX: the fragment containing {len(wtseq)} previously lacked the
        # f prefix, so the placeholder was emitted literally instead of the
        # actual length.
        # NOTE(review): message says "Enrich2" but this check now applies to
        # every program — consider generalising the wording; confirm no test
        # pins the exact text.
        raise exceptions.SequenceFrameError(
            "Enrich2 wild-type sequence for a coding dataset "
            f"must be a multiple of three. Found length {len(wtseq)}."
        )

    return wtseq.upper()

Expand Down Expand Up @@ -137,16 +125,12 @@ def parse_score_column(value, input_type, program):
return value


def parse_offset(offset, coding=True):
    """
    Parse a command-line offset value into an integer.

    Parameters
    ----------
    offset : str or int
        The raw offset value supplied by the user.
    coding : bool
        When True, the offset must be a multiple of three so that the
        reading frame is preserved.

    Returns
    -------
    int
        The parsed offset.

    Raises
    ------
    ValueError
        If *coding* is True and the offset is not a multiple of three.
    """
    parsed = parse_numeric(offset, name="offset", dtype=int)
    if coding and abs(parsed) % 3 != 0:
        raise ValueError("Offset for a coding dataset must be a multiple of three.")
    return parsed


Expand All @@ -167,14 +151,10 @@ def parse_docopt(docopt_args):

# Parse WT and Offset fields
parsed_kwargs["wt_sequence"] = parse_wt_sequence(
docopt_args.get("--wtseq", None),
program=program,
non_coding=not parsed_kwargs["is_coding"],
docopt_args.get("--wtseq", None), coding=parsed_kwargs["is_coding"]
)
parsed_kwargs["offset"] = parse_offset(
docopt_args.get("--offset", 0),
program=program,
non_coding=not parsed_kwargs["is_coding"],
docopt_args.get("--offset", 0), coding=parsed_kwargs["is_coding"]
)

# Parse Input related fields
Expand Down
5 changes: 1 addition & 4 deletions mavedbconvert/utilities.py
Expand Up @@ -252,10 +252,7 @@ def __init__(self, variant):

# Normalize to three letter codes
if self.ref and len(self.ref) == 1:
if self.ref == "?":
self.ref = "???"
else:
self.ref = AA_CODES[self.ref]
self.ref = AA_CODES[self.ref]
if self.alt and len(self.alt) == 1:
if self.alt == "?":
self.alt = "???"
Expand Down
6 changes: 4 additions & 2 deletions mavedbconvert/validators.py
@@ -1,4 +1,5 @@
import logging
from abc import ABCMeta, abstractmethod

import hgvsp

Expand All @@ -16,14 +17,15 @@
logger = logging.getLogger(LOGGER)


class ValidationBackend(metaclass=ABCMeta):
    """
    Validation backend which provides the interface `validate` for validating
    HGVS_ variants.
    """

    @abstractmethod
    def validate(self, variant):
        """Validate a single HGVS variant; concrete backends implement this."""
        pass  # pragma: no cover


class HGVSPatternsBackend(ValidationBackend):
Expand Down
21 changes: 8 additions & 13 deletions mavedbconvert/tests/__init__.py → tests/__init__.py
@@ -1,17 +1,20 @@
import os
import shutil
from unittest import TestCase
from tempfile import TemporaryDirectory
import unittest
import tempfile

import pandas as pd

import logging

logging.disable(logging.CRITICAL)


__all__ = [
"test_base",
"test_empiric",
"test_enrich",
"test_enrich2",
"test_fasta",
"test_utilities",
"test_filters",
"test_validators",
Expand All @@ -20,31 +23,23 @@


# TODO: think up a better name for this class
# TODO: remove the old self.bin stuff
class ProgramTestCase(unittest.TestCase):
    """
    Base test case that provides a per-test temporary copy of the package's
    test data directory, cleaned up automatically after each test.
    """

    def setUp(self):
        # Copy the bundled test data into a fresh temporary directory so
        # individual tests may mutate files without affecting each other.
        self._data_dir = tempfile.TemporaryDirectory()  # store the object
        self.data_dir = os.path.join(
            self._data_dir.name, "data"
        )  # store the directory path
        shutil.copytree(
            src=os.path.join(os.path.dirname(os.path.abspath(__file__)), "data"),
            dst=self.data_dir,
        )

    def mock_multi_sheet_excel_file(self, path, data):
        """Write each mapping in *data* to its own sheet of an xlsx file at *path*."""
        # BUG FIX: use the context manager instead of a bare writer.save() —
        # the old form leaked the file handle on error, and ExcelWriter.save()
        # is deprecated (removed in pandas 2.x).
        with pd.ExcelWriter(path, engine="xlsxwriter") as writer:
            for i, di in enumerate(data):
                df = pd.DataFrame(di)
                df.to_excel(writer, sheet_name="Sheet{}".format(i), index=False)

    def tearDown(self):
        # TemporaryDirectory.cleanup removes the directory tree recursively.
        self._data_dir.cleanup()
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

0 comments on commit 17f31c3

Please sign in to comment.