<a href="https://colab.research.google.com/github/RyloByte/TS-Capstone-2025/blob/%2336/notebooks/colab_hyperpackage_creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hyperpackage Creation v1.0
This Colab UI is an easy-to-use guide for the Hyperpackage Creation software. This Snakemake workflow extends the functionality of TreeSAPP to create composite reference packages (phylogenetic trees + other tools) based on functional homology via Rhea ID, EC number, or other groupings rather than from manually curated collections of protein sequences.

Follow the steps in this notebook to create your own Hyperpackage or follow this in-depth tutorial.

In [None]:
#@title Install Dependencies
!cd /content
!pip install snakemake
!pip install miniconda
!apt-get update && apt-get install -y graphviz
!git clone --branch '#36' --single-branch https://github.com/RyloByte/TS-Capstone-2025.git
%cd TS-Capstone-2025
!cp config.yaml.example config.yaml

import os
os.system("wget -qnc https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh")
os.system("bash Miniforge3-Linux-x86_64.sh -bfp /usr/local")

import sys
sys.path.append("/usr/local/lib/python3.10/site-packages")
os.system("ln -s /usr/local/bin/conda /usr/bin/conda")
os.system("conda --version")

!conda env create -f environment.yaml

# Hyperpackage Creation

In [None]:
#@title Configuration
ID_type = 'Rhea-ID' #@param ["Rhea-ID", "EC-Number"]{allow-input: false}
ID = '10596' #@param {type:"string"}

#@markdown Link to finding Rhea or EC numbers

# Target-name prefix for each supported ID type.
_SAMPLE_PREFIXES = {'Rhea-ID': 'rhea_', 'EC-Number': 'ec_'}

# Drop whitespace pasted into the form; it would otherwise end up in the
# snakemake target path built from `sample`.
ID = ID.strip()
if not ID:
    raise ValueError("ID must not be empty")
# Fail loudly instead of leaving `sample` undefined (or stale from an earlier
# run) when the ID type is not one of the supported options.
if ID_type not in _SAMPLE_PREFIXES:
    raise ValueError(f"Unsupported ID_type: {ID_type!r}")

# `sample` is interpolated into the snakemake targets of the run cells.
sample = _SAMPLE_PREFIXES[ID_type] + ID

In [None]:
#@title Advanced Configuration - No Need To Run If Using Default Settings { display-mode: "form" }

#@markdown ### Cluster Database Settings
Chunk_Size = 10_000_000  #@param {type:"integer", min:1}

#@markdown ---
#@markdown ### Structure Clustering Settings
Min_StructCluster_Size = 5  #@param {type:"integer", min:1}
Max_StructCluster_Size = None  #@param {type:"raw"}

#@markdown ---
#@markdown ### Sequence Clustering Settings
Mute_MMSeqs_Output = True  #@param {type:"boolean"}
Min_SeqCluster_Size = 5  #@param {type:"integer", min:1}
Max_SeqCluster_Size = None  #@param {type:"raw"}

#@markdown ---
#@markdown ### MMseqs2 Parameters
Min_Seq_ID = 0.9  #@param {type:"number", min:0.0, max:1.0}
Cov_Mode = 5  #@param {type:"integer", min:0, max:5}
Kmer_size = 15  #@param {type:"integer", min:1}
Shuffle = 0  #@param [0, 1]
Remove_Temp_Files = 0  #@param [0, 1]
Alignment_Mode = 3  #@param {type:"integer", min:0, max:3}
Realign = 1  #@param [0, 1]

#@markdown ---
#@markdown ### TreeSAPP Create Settings
Mute_TreeSAPP_Output = True  #@param {type:"boolean"}

import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interactive

Extra_Arguments = ""  #@param {type:"string"}

#@markdown ---
#@markdown ### TreeSAPP Assign Settings - None Currently


# ------------------- YAML Generation -------------------
!pip install -q pyyaml
import yaml

config = {
    "cluster_db": {
        "filter_by_sprot": True,
        "chunk_size": Chunk_Size,
    },
    "structure_clustering": {
        "min_cluster_size": Min_StructCluster_Size,
        "max_cluster_size": Max_StructCluster_Size,
    },
    "sequence_clustering": {
        "mute_mmseqs": Mute_MMSeqs_Output,
        "min_cluster_size": Min_SeqCluster_Size,
        "max_cluster_size": Max_SeqCluster_Size,
        "mmseqs_args": [
            f"--min-seq-id {Min_Seq_ID}",
            f"--cov-mode {Cov_Mode}",
            f"-k {Kmer_size}",
            f"--shuffle {Shuffle}",
            f"--remove-tmp-files {Remove_Temp_Files}",
            f"--alignment-mode {Alignment_Mode}",
            f"--realign {Realign}",
        ]
    },
    "treesapp_create": {
        "mute_treesapp": Mute_TreeSAPP_Output,
        "extra_args": Extra_Arguments.split() if Extra_Arguments else [],
    },
    "treesapp_assign": {
        "num_threads": 2
    }
}

# Write YAML to file
with open("config.yaml", "w") as f:
    yaml.dump(config, f, default_flow_style=False)

print("✅ Config file saved as `config.yaml`")

In [None]:
#@title Run TreeSAPP Create
!conda run -n snakemake_env snakemake --use-conda data/hyperpackages/{sample}.refpkg.tar.gz

In [None]:
#@title Install TreeSAPP Assign Dependencies
# Create the `treesapp_cenv` conda environment; no packages are requested at
# this point and -y answers any confirmation prompt.
!conda create -n treesapp_cenv -c bioconda -c conda-forge -y
# Install treesapp into that environment from the bioconda and conda-forge channels.
!conda install -n treesapp_cenv -c bioconda -c conda-forge treesapp -y

In [None]:
#@title TreeSAPP Assign Steps
#@markdown 1. Add your desired fasta file for TreeSAPP assign into `TS-Capstone-2025/data/assign_fastas` directory
#@markdown 2. Input the name of your fasta file (without .fasta extension)
# Base name of the input FASTA; interpolated as `{fasta}` into the snakemake
# target path in the "Run TreeSAPP Assign" cell.
fasta = "geneX" #@param {type:"string"}

In [None]:
#@title Run TreeSAPP Assign

# Request the snakemake target under data/assigned_hyperpackages/ for the
# chosen `fasta` and `sample`, using up to 2 cores.
# NOTE(review): unlike the "Run TreeSAPP Create" cell, this calls `snakemake`
# directly rather than through `conda run -n snakemake_env` — confirm intended.
!snakemake --use-conda --cores 2 data/assigned_hyperpackages/{fasta}/{sample}.refpkg.tar.gz

In [None]:
#@title Save Hyperpackage
from google.colab import files
import os
import shutil

# Folder whose contents are zipped and offered as a browser download.
folder_path = "data/hyperpackages"
if os.path.isdir(folder_path):
  # make_archive returns the path of the archive it actually wrote
  # (hyperpackages.zip in the working directory). Download that file rather
  # than folder_path + ".zip", which is never created.
  filepath = shutil.make_archive("hyperpackages", "zip", folder_path)
  files.download(filepath)
else:
  # Tell the user why nothing was downloaded instead of finishing silently.
  print(f"Nothing to download: '{folder_path}' does not exist. Run TreeSAPP Create first.")

In [None]:
#@title Save Assigned Hyperpackage
from google.colab import files
import os
import shutil

# Folder whose contents are zipped and offered as a browser download.
folder_path = "data/assigned_hyperpackages"
if os.path.isdir(folder_path):
  # make_archive returns the path of the archive it actually wrote
  # (assigned_hyperpackages.zip in the working directory). Download that file
  # rather than folder_path + ".zip", which is never created.
  filepath = shutil.make_archive("assigned_hyperpackages", "zip", folder_path)
  files.download(filepath)
else:
  # Tell the user why nothing was downloaded instead of finishing silently.
  print(f"Nothing to download: '{folder_path}' does not exist. Run TreeSAPP Assign first.")

In [None]:
# Create Statistical Analysis