Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PyOpenSci REVIEW - 59 some more minor changes #60

Merged
merged 6 commits into from
Jan 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 24 additions & 7 deletions docs/examples/example_api.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,23 @@
"from pynteny import Search, Build, Download"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's now create a directory to store results"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"Path(\"example_api/data\").mkdir(exist_ok=False, parents=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -81,27 +98,27 @@
"\n",
"where gene position, locus start, and locus end are taken with respect to the contig.\n",
"\n",
"__NOTE__: To follow this example, you don't need to download _E. coli's_ genome, since it has been already downloaded during Pynteny's installation. But, if you still want to download it, you can get it [here](https://www.ncbi.nlm.nih.gov/nuccore/U00096.2)."
"__NOTE__: You'll need _E. coli's_ genome to follow this example. It's already downloaded in the repo (`tests/test_data/MG1655.gb`), but you can also download it [here](https://www.ncbi.nlm.nih.gov/nuccore/U00096.2)."
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2022-10-04 12:02:20,103 | INFO: Building annotated peptide database\n",
"2022-10-04 12:02:20,735 | INFO: Parsing GenBank data.\n",
"2022-10-04 12:02:21,295 | INFO: Database built successfully!\n"
"2023-01-31 10:14:01,795 | INFO: Building annotated peptide database\n",
"2023-01-31 10:14:02,289 | INFO: Parsing GenBank data.\n",
"2023-01-31 10:14:02,705 | INFO: Database built successfully!\n"
]
}
],
"source": [
"Build(\n",
" data=\"example/data/MG1655.gb\",\n",
" data=\"../../tests/test_data/MG1655.gb\",\n",
" outfile=\"example_api/data/labelled_MG1655.fasta\",\n",
" logfile=None\n",
").run()"
Expand Down Expand Up @@ -396,7 +413,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:23:14) [GCC 10.4.0]"
"version": "3.10.8"
},
"orig_nbformat": 4,
"vscode": {
Expand Down
4 changes: 2 additions & 2 deletions src/pynteny/app/main_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,11 @@
},
)

with open(parent_dir / "assets/styles.css", "r") as file:
with open(parent_dir / "assets/styles.css", "r", encoding="UTF-8") as file:
css_text = file.read()
st.markdown(f"<style>{css_text}</style>", unsafe_allow_html=True)

with open(parent_dir / "assets/script.js", "r") as file:
with open(parent_dir / "assets/script.js", "r", encoding="UTF-8") as file:
js_text = file.read()
st.components.v1.html(f"<script>{js_text}</script>")

Expand Down
2 changes: 1 addition & 1 deletion src/pynteny/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def get_help_str(subcommand: str) -> str:
with tempfile.NamedTemporaryFile(mode="w+") as file:
parser.print_help(file)
file.flush()
with open(file.name) as help_file:
with open(file.name, encoding="UTF-8") as help_file:
help_str = help_file.read()
return help_str

Expand Down
9 changes: 4 additions & 5 deletions src/pynteny/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,19 +396,18 @@ def add_HMM_meta_info_to_hits(self, hmm_meta: Path) -> SyntenyHits:
return self._synteny_hits
pgap = PGAP(hmm_meta)
self._synteny_hits[fields] = ""
# for i, row in self._synteny_hits.iterrows():
for row in self._synteny_hits.itertuples():
i = getattr(row, "Index")
hmm_group = getattr(row, "hmm")
meta_values = [
[
str(v).replace("nan", "")
for k, v in pgap.get_meta_info_for_HMM(hmm).items()
if k != "#ncbi_accession"
]
for hmm in hmm_group.split("|") # row.hmm.split("|")
for hmm in row.hmm.split("|")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔥

]
self._synteny_hits.loc[row.Index, fields] = [
"|".join(v) for v in zip(*meta_values)
]
self._synteny_hits.loc[i, fields] = ["|".join(v) for v in zip(*meta_values)]
return SyntenyHits(self._synteny_hits)

def write_to_TSV(self, output_tsv: Path) -> None:
Expand Down
4 changes: 2 additions & 2 deletions src/pynteny/hmm.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def hmm_names(self) -> list[str]:
return [hmm_path.stem for hmm_path in self._input_hmms]

@staticmethod
def parse_HMM_search_output(hmmer_output: str) -> pd.DataFrame:
def parse_HMM_search_output(hmmer_output: Path) -> pd.DataFrame:
"""Parse hmmsearch or hmmscan summary table output file.

Args:
Expand All @@ -79,7 +79,7 @@ def parse_HMM_search_output(hmmer_output: str) -> pd.DataFrame:
"""
attribs = ["id", "bias", "bitscore", "description"]
hits = defaultdict(list)
with open(hmmer_output) as handle:
with open(hmmer_output, encoding="UTF-8") as handle:
for queryresult in SearchIO.parse(handle, "hmmer3-tab"):
for hit in queryresult.hits:
for attrib in attribs:
Expand Down
16 changes: 8 additions & 8 deletions src/pynteny/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def remove_corrupted_sequences(
fasta = pyfastx.Fasta(
self.file_path.as_posix(), build_index=False, full_name=True
)
with open(output_file, "w+") as outfile:
with open(output_file, "w+", encoding="UTF-8") as outfile:
for record_name, record_seq in fasta:
if is_peptide and (not keep_stop_codon):
record_seq = remove_stop_sodon_signals(record_seq)
Expand Down Expand Up @@ -255,12 +255,12 @@ def split_by_contigs(self, output_dir: Path = None) -> None:
self.file_path.as_posix(), build_index=False, full_name=True
)
for contig_name, seq in contigs:
outfile = (
output_file = (
output_dir / f"{contig_name.split(' ')[0]}{self._input_file.suffix}"
)
with open(outfile, "w+") as file:
file.write(f">{contig_name}\n")
file.write(seq + "\n")
with open(output_file, "w+", encoding="UTF-8") as outfile:
outfile.write(f">{contig_name}\n")
outfile.write(seq + "\n")

def filter_by_minimum_length(
self, min_length: int, output_file: Path = None, point_to_new_file: bool = True
Expand All @@ -280,7 +280,7 @@ def filter_by_minimum_length(
fasta = pyfastx.Fasta(
self.file_path.as_posix(), build_index=False, full_name=True
)
with open(output_file, "w+") as outfile:
with open(output_file, "w+", encoding="UTF-8") as outfile:
for record_name, record_seq in fasta:
if len(record_seq) >= min_length:
outfile.write(f">{record_name}\n{record_seq}\n")
Expand Down Expand Up @@ -311,7 +311,7 @@ def from_prodigal_output(
Path(prodigal_faa.parent) / f"{prodigal_faa.stem}_longlabels.fasta"
)
data = pyfastx.Fasta(prodigal_faa.as_posix(), build_index=False, full_name=True)
with open(output_file, "w+") as outfile:
with open(output_file, "w+", encoding="UTF-8") as outfile:
for record_name, record_seq in data:
name_list = record_name.split(" ")
if len(name_list) < number_prodigal_record_fields:
Expand Down Expand Up @@ -367,7 +367,7 @@ def from_genbank(
Path(gbk_files.pop().parent) / f"{prefix}sequence_database.fasta"
)

with open(output_file, "w+") as outfile:
with open(output_file, "w+", encoding="UTF-8") as outfile:
for gbk_contig in gbk_contigs:
gene_counter = 0
for feature in gbk_contig.features:
Expand Down
24 changes: 13 additions & 11 deletions src/pynteny/subcommands.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import shutil
import logging
from pathlib import Path
from typing import Union
from argparse import ArgumentParser

from pynteny.filter import SyntenyHits, filter_FASTA_by_synteny_structure
from pynteny.hmm import PGAP
Expand All @@ -24,11 +26,11 @@
from pynteny.preprocessing import Database


def init_logger(args) -> logging.Logger:
def init_logger(args: Union[CommandArgs, ArgumentParser]) -> logging.Logger:
"""Initialize logger object

Args:
args (_type_): command arguments object
args (Union[CommandArgs, ArgumentParser]): arguments object

Returns:
logging.Logger: initialized logger object
Expand All @@ -46,11 +48,11 @@ def init_logger(args) -> logging.Logger:
return logger


def synteny_search(args) -> SyntenyHits:
def synteny_search(args: Union[CommandArgs, ArgumentParser]) -> SyntenyHits:
"""Search peptide database by synteny structure containing HMMs.

Args:
args (argparse.ArgumentParser): arguments object.
args (Union[CommandArgs, ArgumentParser]): arguments object.

Returns:
SyntenyHits: instance of SyntenyHits.
Expand Down Expand Up @@ -157,12 +159,12 @@ def synteny_search(args) -> SyntenyHits:
return synteny_hits


def build_database(args) -> None:
def build_database(args: Union[CommandArgs, ArgumentParser]) -> None:
"""Build annotated peptide database from input assembly
or GenBank data.

Args:
args (argparse.ArgumentParser): arguments object.
args (Union[CommandArgs, ArgumentParser]): arguments object.
"""
logger = init_logger(args)

Expand All @@ -175,11 +177,11 @@ def build_database(args) -> None:
logging.shutdown()


def parse_gene_ids(args) -> str:
def parse_gene_ids(args: Union[CommandArgs, ArgumentParser]) -> str:
"""Convert gene symbols to hmm names.

Args:
args (argparse.ArgumentParser): arguments object.
args (Union[CommandArgs, ArgumentParser]): arguments object.

Returns:
str: synteny structure where gene symbols are replaced
Expand Down Expand Up @@ -208,11 +210,11 @@ def parse_gene_ids(args) -> str:
return gene_synteny_struc


def download_hmms(args) -> None:
def download_hmms(args: Union[CommandArgs, ArgumentParser]) -> None:
"""Download HMM (PGAP) database from NCBI.

Args:
args (argparse.ArgumentParser): arguments object.
args (Union[CommandArgs, ArgumentParser]): arguments object.
"""
logger = init_logger(args)
module_dir = Path(__file__).parent
Expand Down Expand Up @@ -275,7 +277,7 @@ def run_app() -> None:
terminal_execute(cmd_str)


def get_citation(args, silent: bool = False) -> str:
def get_citation(args: Union[CommandArgs, ArgumentParser], silent: bool = False) -> str:
"""Get Pynteny citation string.

Args:
Expand Down
6 changes: 3 additions & 3 deletions src/pynteny/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def initialize_config_file() -> Path:
"PGAP_meta_file": "",
"streamlit_process": "",
}
with open(config_file, "w") as f:
with open(config_file, "w", encoding="UTF-8") as f:
json.dump(config, f, indent=4)
return config_file

Expand All @@ -71,13 +71,13 @@ def get_config(self) -> dict:
Returns:
dict: dict containing fields and values of config file.
"""
with open(self._config_file, "r") as file:
with open(self._config_file, "r", encoding="UTF-8") as file:
config = json.loads(file.read())
return config

def write_config(self) -> None:
    """Serialize the in-memory config dict to the JSON config file.

    Overwrites ``self._config_file`` with the current contents of
    ``self._config``, pretty-printed with a 4-space indent. The file is
    opened with an explicit UTF-8 encoding so behavior does not depend
    on the platform's default locale encoding.
    """
    # The diff rendering had left both the old (no encoding) and new
    # (encoding="UTF-8") `with open` lines in place; only the post-merge
    # version is kept here.
    with open(self._config_file, "w", encoding="UTF-8") as f:
        json.dump(self._config, f, indent=4)

def update_config(self, key: str, value: str) -> None:
Expand Down