Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PyOpenSci REVIEW - 59 some more minor changes #60

Merged
merged 6 commits into from
Jan 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 24 additions & 7 deletions docs/examples/example_api.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,23 @@
"from pynteny import Search, Build, Download"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's now create a directory to store results"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"Path(\"example_api/data\").mkdir(exist_ok=False, parents=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -81,27 +98,27 @@
"\n",
"where gene position, locus start, and locus end are taken with respect to the contig.\n",
"\n",
"__NOTE__: To follow this example, you don't need to download _E. coli's_ genome, since it has been already downloaded during Pynteny's installation. But, if you still want to download it, you can get it [here](https://www.ncbi.nlm.nih.gov/nuccore/U00096.2)."
"__NOTE__: You'll need _E. coli's_ genome to follow this example. It's already downloaded in the repo (`tests/test_data/MG1655.gb`), but you can also download it [here](https://www.ncbi.nlm.nih.gov/nuccore/U00096.2)."
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2022-10-04 12:02:20,103 | INFO: Building annotated peptide database\n",
"2022-10-04 12:02:20,735 | INFO: Parsing GenBank data.\n",
"2022-10-04 12:02:21,295 | INFO: Database built successfully!\n"
"2023-01-31 10:14:01,795 | INFO: Building annotated peptide database\n",
"2023-01-31 10:14:02,289 | INFO: Parsing GenBank data.\n",
"2023-01-31 10:14:02,705 | INFO: Database built successfully!\n"
]
}
],
"source": [
"Build(\n",
" data=\"example/data/MG1655.gb\",\n",
" data=\"../../tests/test_data/MG1655.gb\",\n",
" outfile=\"example_api/data/labelled_MG1655.fasta\",\n",
" logfile=None\n",
").run()"
Expand Down Expand Up @@ -396,7 +413,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:23:14) [GCC 10.4.0]"
"version": "3.10.8"
},
"orig_nbformat": 4,
"vscode": {
Expand Down
4 changes: 2 additions & 2 deletions src/pynteny/app/main_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,11 @@
},
)

with open(parent_dir / "assets/styles.css", "r") as file:
with open(parent_dir / "assets/styles.css", "r", encoding="UTF-8") as file:
css_text = file.read()
st.markdown(f"<style>{css_text}</style>", unsafe_allow_html=True)

with open(parent_dir / "assets/script.js", "r") as file:
with open(parent_dir / "assets/script.js", "r", encoding="UTF-8") as file:
js_text = file.read()
st.components.v1.html(f"<script>{js_text}</script>")

Expand Down
2 changes: 1 addition & 1 deletion src/pynteny/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def get_help_str(subcommand: str) -> str:
with tempfile.NamedTemporaryFile(mode="w+") as file:
parser.print_help(file)
file.flush()
with open(file.name) as help_file:
with open(file.name, encoding="UTF-8") as help_file:
help_str = help_file.read()
return help_str

Expand Down
9 changes: 4 additions & 5 deletions src/pynteny/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,19 +396,18 @@ def add_HMM_meta_info_to_hits(self, hmm_meta: Path) -> SyntenyHits:
return self._synteny_hits
pgap = PGAP(hmm_meta)
self._synteny_hits[fields] = ""
# for i, row in self._synteny_hits.iterrows():
for row in self._synteny_hits.itertuples():
i = getattr(row, "Index")
hmm_group = getattr(row, "hmm")
meta_values = [
[
str(v).replace("nan", "")
for k, v in pgap.get_meta_info_for_HMM(hmm).items()
if k != "#ncbi_accession"
]
for hmm in hmm_group.split("|") # row.hmm.split("|")
for hmm in row.hmm.split("|")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔥

]
self._synteny_hits.loc[row.Index, fields] = [
"|".join(v) for v in zip(*meta_values)
]
self._synteny_hits.loc[i, fields] = ["|".join(v) for v in zip(*meta_values)]
return SyntenyHits(self._synteny_hits)

def write_to_TSV(self, output_tsv: Path) -> None:
Expand Down
4 changes: 2 additions & 2 deletions src/pynteny/hmm.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def hmm_names(self) -> list[str]:
return [hmm_path.stem for hmm_path in self._input_hmms]

@staticmethod
def parse_HMM_search_output(hmmer_output: str) -> pd.DataFrame:
def parse_HMM_search_output(hmmer_output: Path) -> pd.DataFrame:
"""Parse hmmsearch or hmmscan summary table output file.

Args:
Expand All @@ -79,7 +79,7 @@ def parse_HMM_search_output(hmmer_output: str) -> pd.DataFrame:
"""
attribs = ["id", "bias", "bitscore", "description"]
hits = defaultdict(list)
with open(hmmer_output) as handle:
with open(hmmer_output, encoding="UTF-8") as handle:
for queryresult in SearchIO.parse(handle, "hmmer3-tab"):
for hit in queryresult.hits:
for attrib in attribs:
Expand Down
16 changes: 8 additions & 8 deletions src/pynteny/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def remove_corrupted_sequences(
fasta = pyfastx.Fasta(
self.file_path.as_posix(), build_index=False, full_name=True
)
with open(output_file, "w+") as outfile:
with open(output_file, "w+", encoding="UTF-8") as outfile:
for record_name, record_seq in fasta:
if is_peptide and (not keep_stop_codon):
record_seq = remove_stop_sodon_signals(record_seq)
Expand Down Expand Up @@ -255,12 +255,12 @@ def split_by_contigs(self, output_dir: Path = None) -> None:
self.file_path.as_posix(), build_index=False, full_name=True
)
for contig_name, seq in contigs:
outfile = (
output_file = (
output_dir / f"{contig_name.split(' ')[0]}{self._input_file.suffix}"
)
with open(outfile, "w+") as file:
file.write(f">{contig_name}\n")
file.write(seq + "\n")
with open(output_file, "w+", encoding="UTF-8") as outfile:
outfile.write(f">{contig_name}\n")
outfile.write(seq + "\n")

def filter_by_minimum_length(
self, min_length: int, output_file: Path = None, point_to_new_file: bool = True
Expand All @@ -280,7 +280,7 @@ def filter_by_minimum_length(
fasta = pyfastx.Fasta(
self.file_path.as_posix(), build_index=False, full_name=True
)
with open(output_file, "w+") as outfile:
with open(output_file, "w+", encoding="UTF-8") as outfile:
for record_name, record_seq in fasta:
if len(record_seq) >= min_length:
outfile.write(f">{record_name}\n{record_seq}\n")
Expand Down Expand Up @@ -311,7 +311,7 @@ def from_prodigal_output(
Path(prodigal_faa.parent) / f"{prodigal_faa.stem}_longlabels.fasta"
)
data = pyfastx.Fasta(prodigal_faa.as_posix(), build_index=False, full_name=True)
with open(output_file, "w+") as outfile:
with open(output_file, "w+", encoding="UTF-8") as outfile:
for record_name, record_seq in data:
name_list = record_name.split(" ")
if len(name_list) < number_prodigal_record_fields:
Expand Down Expand Up @@ -367,7 +367,7 @@ def from_genbank(
Path(gbk_files.pop().parent) / f"{prefix}sequence_database.fasta"
)

with open(output_file, "w+") as outfile:
with open(output_file, "w+", encoding="UTF-8") as outfile:
for gbk_contig in gbk_contigs:
gene_counter = 0
for feature in gbk_contig.features:
Expand Down
24 changes: 13 additions & 11 deletions src/pynteny/subcommands.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import shutil
import logging
from pathlib import Path
from typing import Union
from argparse import ArgumentParser

from pynteny.filter import SyntenyHits, filter_FASTA_by_synteny_structure
from pynteny.hmm import PGAP
Expand All @@ -24,11 +26,11 @@
from pynteny.preprocessing import Database


def init_logger(args) -> logging.Logger:
def init_logger(args: Union[CommandArgs, ArgumentParser]) -> logging.Logger:
"""Initialize logger object

Args:
args (_type_): command arguments object
args (Union[CommandArgs, ArgumentParser]): arguments object

Returns:
logging.Logger: initialized logger object
Expand All @@ -46,11 +48,11 @@ def init_logger(args) -> logging.Logger:
return logger


def synteny_search(args) -> SyntenyHits:
def synteny_search(args: Union[CommandArgs, ArgumentParser]) -> SyntenyHits:
"""Search peptide database by synteny structure containing HMMs.

Args:
args (argparse.ArgumentParser): arguments object.
args (Union[CommandArgs, ArgumentParser]): arguments object.

Returns:
SyntenyHits: instance of SyntenyHits.
Expand Down Expand Up @@ -157,12 +159,12 @@ def synteny_search(args) -> SyntenyHits:
return synteny_hits


def build_database(args) -> None:
def build_database(args: Union[CommandArgs, ArgumentParser]) -> None:
"""Build annotated peptide database from input assembly
or GenBank data.

Args:
args (argparse.ArgumentParser): arguments object.
args (Union[CommandArgs, ArgumentParser]): arguments object.
"""
logger = init_logger(args)

Expand All @@ -175,11 +177,11 @@ def build_database(args) -> None:
logging.shutdown()


def parse_gene_ids(args) -> str:
def parse_gene_ids(args: Union[CommandArgs, ArgumentParser]) -> str:
"""Convert gene symbols to hmm names.

Args:
args (argparse.ArgumentParser): arguments object.
args (Union[CommandArgs, ArgumentParser]): arguments object.

Returns:
str: synteny structure where gene symbols are replaced
Expand Down Expand Up @@ -208,11 +210,11 @@ def parse_gene_ids(args) -> str:
return gene_synteny_struc


def download_hmms(args) -> None:
def download_hmms(args: Union[CommandArgs, ArgumentParser]) -> None:
"""Download HMM (PGAP) database from NCBI.

Args:
args (argparse.ArgumentParser): arguments object.
args (Union[CommandArgs, ArgumentParser]): arguments object.
"""
logger = init_logger(args)
module_dir = Path(__file__).parent
Expand Down Expand Up @@ -275,7 +277,7 @@ def run_app() -> None:
terminal_execute(cmd_str)


def get_citation(args, silent: bool = False) -> str:
def get_citation(args: Union[CommandArgs, ArgumentParser], silent: bool = False) -> str:
"""Get Pynteny citation string.

Args:
Expand Down
6 changes: 3 additions & 3 deletions src/pynteny/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def initialize_config_file() -> Path:
"PGAP_meta_file": "",
"streamlit_process": "",
}
with open(config_file, "w") as f:
with open(config_file, "w", encoding="UTF-8") as f:
json.dump(config, f, indent=4)
return config_file

Expand All @@ -71,13 +71,13 @@ def get_config(self) -> dict:
Returns:
dict: dict containing fields and values of config file.
"""
with open(self._config_file, "r") as file:
with open(self._config_file, "r", encoding="UTF-8") as file:
config = json.loads(file.read())
return config

def write_config(self) -> None:
    """Serialize the in-memory config dict to the JSON config file.

    Overwrites ``self._config_file`` with the current contents of
    ``self._config``, pretty-printed with a 4-space indent. The file is
    opened with an explicit UTF-8 encoding so behavior does not depend
    on the platform's default locale encoding.
    """
    # The diff rendering had left both the old (no encoding) and new
    # (encoding="UTF-8") `with open` lines in place; only the post-merge
    # version is kept here.
    with open(self._config_file, "w", encoding="UTF-8") as f:
        json.dump(self._config, f, indent=4)

def update_config(self, key: str, value: str) -> None:
Expand Down