# Hyperpackage Creation v1.0
This Colab UI is an easy-to-use guide for the Hyperpackage Creation software. This snakemake workflow extends the functionality of TreeSAPP to create composite reference packages (phylogentic trees + other tools) based on functional homology via Rhea ID, EC number, or other groupings rather than from manually curated collections of protein sequences.

Follow the steps in this notebook to create your own Hyperpackage or follow this in-depth tutorial.

In [2]:
!cd /content

In [1]:
#@title Install Dependencies
!cd /content
!pip install snakemake
!pip install miniconda
!apt-get update && apt-get install -y graphviz
!git clone https://github.com/RyloByte/TS-Capstone-2025.git
%cd TS-Capstone-2025
!cp config.yaml.example config.yaml

import os
os.system("wget -qnc https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh")
os.system("bash Miniforge3-Linux-x86_64.sh -bfp /usr/local")

import sys
sys.path.append("/usr/local/lib/python3.10/site-packages")
os.system("ln -s /usr/local/bin/conda /usr/bin/conda")
os.system("conda --version")

!conda env create -f environment.yaml

Collecting snakemake
  Downloading snakemake-9.1.10-py3-none-any.whl.metadata (2.7 kB)
Collecting appdirs (from snakemake)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting immutables (from snakemake)
  Downloading immutables-0.21-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting configargparse (from snakemake)
  Downloading ConfigArgParse-1.7-py3-none-any.whl.metadata (23 kB)
Collecting connection_pool>=0.0.3 (from snakemake)
  Downloading connection_pool-0.0.3.tar.gz (3.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting humanfriendly (from snakemake)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Collecting pulp<3.1,>=2.3.1 (from snakemake)
  Downloading PuLP-3.0.2-py3-none-any.whl.metadata (6.7 kB)
Collecting reretry (from snakemake)
  Downloading reretry-0.11.8-py2.py3-none-any.whl.metadata (5.5 kB)
Collecting snakemake-interface-execu

# Hyperpackage Creation

In [2]:
#@title Configuration
ID_Type = 'Rhea-ID' #@param ["Rhea-ID", "EC-Number"]{allow-input: false}
ID = '10596' #@param {type:"string"}

In [3]:
#@title Advanced Configuration - No Need To Run If Using Default Settings { display-mode: "form" }

#@markdown ### Cluster Database Settings
Chunk_Size = 10_000_000  #@param {type:"integer", min:1}

#@markdown ---
#@markdown ### Structure Clustering Settings
Min_StructCluster_Size = 5  #@param {type:"integer", min:1}
Max_StructCluster_Size = None  #@param {type:"raw"}

#@markdown ---
#@markdown ### Sequence Clustering Settings
Mute_MMSeqs_Output = True  #@param {type:"boolean"}
Min_SeqCluster_Size = 5  #@param {type:"integer", min:1}
Max_SeqCluster_Size = None  #@param {type:"raw"}

#@markdown ---
#@markdown ### MMseqs2 Parameters
Min_Seq_ID = 0.9  #@param {type:"number", min:0.0, max:1.0}
Cov_Mode = 5  #@param {type:"integer", min:0, max:5}
Kmer_size = 15  #@param {type:"integer", min:1}
Shuffle = 0  #@param [0, 1]
Remove_Temp_Files = 0  #@param [0, 1]
Alignment_Mode = 3  #@param {type:"integer", min:0, max:3}
Realign = 1  #@param [0, 1]

#@markdown ---
#@markdown ### TreeSAPP Create Settings
Mute_TreeSAPP_Output = True  #@param {type:"boolean"}

import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interactive

Extra_Arguments = ""  #@param {type:"string"}

#@markdown ---
#@markdown ### TreeSAPP Assign Settings - None Currently


# ------------------- YAML Generation -------------------
!pip install -q pyyaml
import yaml

config = {
    "cluster_db": {
        "filter_by_sprot": True,
        "chunk_size": Chunk_Size,
    },
    "structure_clustering": {
        "min_cluster_size": Min_StructCluster_Size,
        "max_cluster_size": Max_StructCluster_Size,
    },
    "sequence_clustering": {
        "mute_mmseqs": Mute_MMSeqs_Output,
        "min_cluster_size": Min_SeqCluster_Size,
        "max_cluster_size": Max_SeqCluster_Size,
        "mmseqs_args": [
            f"--min-seq-id {Min_Seq_ID}",
            f"--cov-mode {Cov_Mode}",
            f"-k {Kmer_size}",
            f"--shuffle {Shuffle}",
            f"--remove-tmp-files {Remove_Temp_Files}",
            f"--alignment-mode {Alignment_Mode}",
            f"--realign {Realign}",
        ]
    },
    "treesapp_create": {
        "mute_treesapp": Mute_TreeSAPP_Output,
        "extra_args": Extra_Arguments.split() if Extra_Arguments else [],
    },
    "treesapp_assign": {
        "num_threads": 2
    }
}

# Write YAML to file
with open("config.yaml", "w") as f:
    yaml.dump(config, f, default_flow_style=False)

print("✅ Config file saved as `config.yaml`")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/767.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m767.5/767.5 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Config file saved as `config.yaml`


In [5]:
#@title Run TreeSAPP Create
import subprocess

if ID_Type == 'Rhea-ID':
    !conda run -n snakemake_env snakemake --use-conda data/hyperpackages/rhea_{ID}.refpkg.tar.gz
    # subprocess.run(["snakemake", "--use-conda", f"data/hyperpackages/rhea_{ID}.refpkg.tar.gz"])
elif ID_Type == 'EC-Number':
    !conda run -n snakemake_env snakemake --use-conda data/hyperpackages/ec_{ID}.refpkg.tar.gz
    # subprocess.run(["snakemake", "--use-conda", f"data/hyperpackages/ec_{ID}.refpkg.tar.gz"])

Found clusters for 548264/572970 (95.7%) SwissProt accessions
Querying structure cluster db for 735 sequences in data/rhea_10596.fasta...
Found 218 clusters with 1316 sequences
Cluster size / count
  1: █████████████████████████████████████████████████████████████████████████████ 77
  2: ██████████████████████████████████ 34
  3: ████████████████████ 20
  4: ███████████ 11
<----------- MIN CLUSTER SIZE: 5 ----------->
  5: ████████████ 12
  6: ████████ 8
  7: ███████ 7
  8: █ 1
  9: ███████ 7
 10: ████ 4
 11: █████ 5
 12: █ 1
 13: ██ 2
 14: █████ 5
 16: ██ 2
 17: ██ 2
 18: ██ 2
 19: █ 1
 20: █ 1
 21: ███ 3
 22: █ 1
 23: █ 1
 25: ██ 2
 26: █ 1
 29: █ 1
 31: █ 1
 32: █ 1
 34: █ 1
 42: ██ 2
 47: █ 1
 54: █ 1
Keeping 76 clusters >= 5 (1067 total sequences)
Cluster size / count
  1: ████████████████████████████████████████████████████████████████████████████████ 614
  2: ████████████ 98
  3: ███ 28
  4: █ 13
<----------- MIN CLUSTER SIZE: 5 ----------->
  5: █ 9
  8: █ 2
 10: █ 1
 11: █ 2
 

In [6]:
!conda create -n treesapp_cenv -c bioconda -c conda-forge -y
!conda install -n treesapp_cenv -c bioconda -c conda-forge treesapp -y

Channels:
 - bioconda
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / done
Solving environment: \ done


    current version: 24.11.3
    latest version: 25.3.1

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /usr/local/envs/treesapp_cenv




Downloading and Extracting Packages:

Preparing transaction: - done
Verifying transaction: | / - done
Executing transaction: | done
#
# To activate this environment, use
#
#     $ conda activate treesapp_cenv
#
# To deactivate an active environment, use
#
#     $ conda deactivate

Channels:
 - bioconda
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.js

In [8]:
#@title TreeSAPP Assign Steps
#@markdown 1. Add your desired fasta file for TreeSAPP assign into `TS-Capstone-2025/data/assign_fastas` directory
#@markdown 2. Input the name of your fasta file (without .fasta extension)
fasta = "geneX" #@param {type:"string"}

In [17]:
import subprocess

target = f"data/assigned_hyperpackages/{fasta}/rhea_{ID}.refpkg.tar.gz"
command = f"""
source activate snakemake_env && \
snakemake --use-conda --cores 2 {target}
"""

subprocess.run(command, shell=True, executable="/bin/bash")

CompletedProcess(args='\nsource activate snakemake_env && snakemake --use-conda --cores 2 data/assigned_hyperpackages/geneX/rhea_10596.refpkg.tar.gz\n', returncode=1)

In [20]:
#@title Run TreeSAPP Assign

fasta = "geneX"
ID = "10596"
# print("!snakemake --use-conda data/assigned_hyperpackages/geneX/rhea_10596.refpkg.tar.gz")
!snakemake --use-conda data/assigned_hyperpackages/geneX/rhea_10596.refpkg.tar.gz
# if ID_Type == 'Rhea-ID':
#   !snakemake --use-conda data/assigned_hyperpackages/geneX/rhea_10596.refpkg.tar.gz
#   # !conda run -n snakemake_env snakemake --use-conda --cores 2 data/assigned_hyperpackages/{fasta}/rhea_{ID}.refpkg.tar.gz
# elif ID_Type == 'EC-Number':
#   !conda run -n snakemake_env snakemake --use-conda --cores 2 data/assigned_hyperpackages/{fasta}/ec_{ID}.refpkg.tar.gz

[33mAssuming unrestricted shared filesystem usage.[0m
[33mhost: 77faee3425ad[0m
[33mBuilding DAG of jobs...[0m
[33mYour conda installation is not configured to use strict channel priorities. This is however important for having robust and correct environments (for details, see https://conda-forge.org/docs/user/tipsandtricks.html). Please consider to configure strict priorities by executing 'conda config --set channel_priority strict'.[0m
[33mUsing shell: /usr/bin/bash[0m
[32mProvided cores: 2[0m
[32mRules claiming more threads will be scaled down.[0m
[33mJob stats:
job                count
---------------  -------
treesapp_assign        1
total                  1
[0m
[33mSelect jobs to execute...[0m
[33mExecute 1 jobs...[0m

[32m[Sun Apr 13 23:54:51 2025]
localrule treesapp_assign:
    input: data/hyperpackages/rhea_10596.refpkg.tar.gz, data/assign_fastas/geneX.fasta
    output: data/assigned_hyperpackages/geneX/rhea_10596.refpkg.tar.gz
    jobid: 0
    reason: Mis

In [None]:
# Run hyperpackage creation

import ipywidgets as widgets
from IPython.display import display, Markdown
import subprocess

# input_file = widgets.Text(value='data/example.fasta', description='Input File:')
run_button = widgets.Button(description="Run Pipeline")

rhea_id = widgets.Text(value='10596', description='Rhea ID:')

def on_button_clicked(b):
    print("Running Snakemake...")
    command = f"snakemake --use-conda data/hyperpackage/rhea_{rhea_id}.refpkg.tar.gz"
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    print(result.stdout)
    if result.stderr:
        print("ERROR:\n", result.stderr)

display(rhea_id, run_button)
run_button.on_click(on_button_clicked)

Text(value='10596', description='Rhea ID:')

Button(description='Run Pipeline', style=ButtonStyle())

Running Snakemake...

ERROR:
 /bin/sh: 1: Syntax error: "(" unexpected



In [None]:
# Look at statistics

In [None]:
# Save Hyperpackage

# Hyperpackage TreeSAPPAssign

In [None]:
# Use Hyperpackage or select
subprocess.run(["snakemake", "--use-conda", f"data/hyperpackages/assigned_{rhea_id_1}.refpkg.tar.gz"])

In [None]:
import ipywidgets as widgets
from IPython.display import display, Markdown
import subprocess

input_file = widgets.Text(value='data/example.fasta', description='Input File:')
run_button = widgets.Button(description="Run Pipeline")

def on_button_clicked(b):
    print("Running Snakemake...")
    command = f"snakemake --cores 2 --config input={input_file.value}"
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    print(result.stdout)
    if result.stderr:
        print("ERROR:\n", result.stderr)

display(input_file, run_button)
run_button.on_click(on_button_clicked)