# Set Up
- access to HPC system of CAU
- set up conda environments for the bioinformatic pipelines


## Installation of SSH client
## Log-in steps
- Bitvise SSH Client
- Host: nesh-login.rz.uni-kiel.de
- Port: 22
- Username: smomw681
- Initial method: password
- Enter password and log in 
-> access to SFTP and a terminal directly connected to the HPC system


In [None]:
# Alternatively, connect from a command line instead of the Bitvise client:
ssh -X smomw681@nesh-login.rz.uni-kiel.de
# NOTE(review): `login` below reads like a reminder to enter the password rather than a command to run — confirm
login 

If there is any problem, contact: hpcsupport@rz.uni-kiel.de

The operating system is Rocky Linux 8.6 with the default system compiler gcc v8.5.0.

After account login: 

$WORK quota set to 10 TB (hard), 9 TB (soft) 

Absolute path to my storage: /gxfs_work/geomar/smomw681
Relative path: $WORK


Before doing anything else, load the gcc compiler and the conda and/or python modules:

In [None]:
# Change to the work directory, then load the compiler environment and conda.
cd $WORK
module load gcc12-env/12.3.0
module load miniconda3/24.11.1

# show all available software modules
# NOTE(review): `all` is not a `module` subcommand I can confirm — verify that it works on this system
module all

# show the modules available in the currently active environment
module avail

# some possibly useful modules
module load cluster-env/default
module load matlab-geomar/R2023a

In [None]:
## How to use python env
# Load the compiler environment and the python module BEFORE touching the venv.
module load gcc12-env/12.3.0
module load python/3.11.5
# create a python environment (-p: no error if the directory already exists)
mkdir -p "$HOME/my_python_env"
python -m venv "$HOME/my_python_env/my_env"
# install package into env
source "$HOME/my_python_env/my_env/bin/activate"
module load gcc/12.3.0
pip install ...
deactivate
# Use the installed package
# The python module must be loaded before the venv is activated: loading it
# afterwards puts the module's bin directory in front of the venv's on PATH,
# so `python` would no longer be the venv interpreter.
module load gcc12-env/12.3.0
module load python/3.11.5
source "$HOME/my_python_env/my_env/bin/activate"
...
deactivate

## Access control 


In [None]:
# Inspect the current permissions of the account directory.
# <account to access> and <me> are placeholders for user names.
# NOTE(review): all paths assume the account lives under /gxfs_work/geomar — adjust if it does not.
ls -l /gxfs_work/geomar/<account to access>/
ls -l /gxfs_work/geomar | grep <account to access>
getfacl /gxfs_work/geomar/<account to access>/

# Grant access via an ACL entry. The entry must have the form u:<user>:<perms>;
# r-x allows listing and entering the directory (use rwx to also allow writing).
setfacl -m u:<me>:r-x /gxfs_work/geomar/<account to access>
# verify that the new entry is present
getfacl /gxfs_work/geomar/<account to access>/

# assess size of folders
du -sh dir_path
# assess size of files
ls -lh file_path

## Set up of required environment
Conda is already installed in the working directory of smomw681. 
A separate conda environment is required for each pipeline, containing the following bioconda packages, other packages and modules:



In [None]:
# For each and every environment, load these modules:
    module load gcc12-env/12.3.0
    module load miniconda3/24.11.1
    module load parallel/20230822
    module load boost/1.83.0
    module load gcc/12.3.0
    module load gsl/2.7.1
    module load nano/7.2

# list my conda environments:
conda env list

# list the packages in an environment (replace myenv with the environment name)
conda list -n myenv

# update every package in the environment
conda update --all -y

# Preprocessing: create the FastqDump environment and install pinned package versions
    conda create --name FastqDump -y
    conda activate FastqDump
    # the basic modules (see "For each and every environment") are loaded at this point
    conda install bioconda::sra-tools=3.2.0 -y
    conda install bioconda::parallel-fastq-dump=0.6.7 -y 
    conda install conda-forge::parallel=20241222 -y
    conda install conda-forge::ncbi-datasets-cli=16.40.1 -y
    conda install bioconda::ncbi-genome-download=0.3.3 -y

# Assembly: create the Assembly environment and install its packages
    conda create --name Assembly -y 
    conda activate Assembly    
    conda install conda-forge::libgcc-ng=14.2.0     # needed for the current versions of spades and bbmap
    conda install bioconda::spades=4.0.0 -y   # or v4.1.0
    conda install bioconda::trimmomatic=0.39 -y  
    conda install bioconda::bbmap # version not pinned; v39.18 was installed
    conda install conda-forge::pigz=2.8 -y
    
# PacBio_Assembly: create the PacBio_Assembly environment and install pinned package versions
    conda create -n PacBio_Assembly
    conda activate PacBio_Assembly
    conda install bioconda::wtdbg=2.5 -y
    conda install bioconda::quast=5.3.0 -y
    conda install bioconda::flye=2.9.5 -y
    conda install bioconda::canu=2.3 -y
    conda install bioconda::unicycler=0.5.1 -y
    conda install bioconda::fastqc=0.12.1 -y 
    conda install bioconda::multiqc=1.27.1 -y
    conda install conda-forge::parallel=20241222 -y
    conda install bioconda::trycycler=0.5.5 -y
    conda install bioconda::filtlong=0.2.1 -y
    conda install bioconda::bandage=0.8.1 -y
    # NOTE: because of a mash dependency error, the trycycler package was installed in a separate env (see "Trycycler" below)

# Trycycler (raven, flye, hifiasm)
    # Create the dedicated trycycler environment with trycycler already in it.
    conda create -c bioconda -c conda-forge -n trycycler trycycler # v0.5.5
    # Activate it first; otherwise the installs below go into whichever
    # environment happens to be active (e.g. PacBio_Assembly).
    conda activate trycycler
    conda install bioconda::raven-assembler=1.8.3
    conda install -c bioconda pbmm2 pb-assembly pbbam # v1.17.0, v0.0.7, v2.4.0
    conda install bioconda::any2fasta=0.4.2 
    conda install bioconda::hifiasm=0.25.0
    conda install bioconda::hifiasm_meta=v0.3.2
    conda install bioconda::flye=2.9.5
    conda install bioconda::minipolish=0.1.3

# MAG prep and after: create the MAG environment
    conda create -n MAG
    conda activate MAG
    # the conda package for DeepMicroClass is not used; it is installed with pip below
    #conda install bioconda::deepmicroclass=1.0.3 -y 
    conda install python=3.12.9 -y
    pip install DeepMicroClass # without pytorch: use CPU only
    pip install requests # v2.32.3
    # the packages below are not pinned; the trailing comments record the installed versions
    conda install bioconda::checkm-genome -y #=1.2.3
    conda install bioconda::prodigal -y #=2.6.3
    conda install bioconda::pprodigal -y #=1.0.1
    conda install bioconda::gtdbtk -y #=2.4.0
    # the database for gtdbtk was also downloaded: run script 3_0
    conda install bioconda/label/cf201901::mash # v2.3

# MAG_construction and check: create the METABAT2 environment
    conda create -n METABAT2
    conda activate METABAT2
    conda install bioconda::metabat2=2.17
    # metabat2 must be installed anew following the instructions in the official repository, because of a negative coverage problem
    # https://bitbucket.org/berkeleylab/metabat/src/master/INSTALL.md
    module load gcc12-env/12.3.0
    module load gcc/12.3.0
    module load boost/1.83.0
    module load cmake/3.27.4
    conda install bioconda::checkm2=1.1.0
    # checkm2 database is in /gxfs_work/geomar/smomw681/.conda/envs/METABAT2/checkm_data/
    # or download it to a custom directory and set the path as an environment variable
        checkm2 database --download --path  /gxfs_work/geomar/smomw681/DATABASES/CheckM_db/
        export CHECKM2DB="/gxfs_work/geomar/smomw681/DATABASES/CheckM_db/CheckM2_database"
        conda env config vars set CHECKM2DB="/gxfs_work/geomar/smomw681/DATABASES/CheckM_db/CheckM2_database"
    conda install bioconda::coverm=0.7.0
    conda install bioconda::drep # version not pinned
    # install the dependencies of drep
    conda install bioconda/label/cf201901::mash # v2.1
    module load boost/1.83.0
    module load gcc/12.3.0
    module load gsl/2.7.1
    conda install bioconda::centrifuge -y # v1.0.4.2
    # conda install bioconda/label/cf201901::fastani -y # v1.1
        # The dependency problem of fastANI could not be solved here (conflict with gsllib),
        # so a separate environment was made for it (see "dRep" below)
    
    

# HiFi-MAG MAG_construction (might be a better method next time)
# create the HiFi_MAG environment and install snakemake into it
    conda create -n HiFi_MAG 
    conda activate HiFi_MAG
    conda install bioconda::snakemake


# dRep: separate environment in which the dependencies are installed before drep itself
    conda create -n dRep
    conda activate dRep
    module load boost/1.83.0
    module load gcc/12.3.0
    module load gsl/2.7.1
    # dependencies first (trailing comments record the installed versions)
    conda install bioconda/label/cf201901::mash # v2.1
    conda install bioconda::centrifuge -y # v1.0.4.2
    conda install bioconda/label/cf201901::fastani -y # v1.1
    conda install bioconda::checkm2 -y # v1.1.0
    # main package last
    conda install bioconda::drep # v3.5.0
    # It finally worked! Install the dependencies first and then the main package.

# GTDB-TK: create the GTDBTK environment with gtdbtk already in it
    conda create -n GTDBTK bioconda::gtdbtk #v2.4.0
    conda activate GTDBTK
    download-db.sh  # or alternatively copy the database from the conda env "MAG" (next three commands)
    cd /gxfs_work/geomar/smomw681/DATABASES/
    mkdir GTDBTK_db
    cp -R /gxfs_work/geomar/smomw681/.conda/envs/MAG/share/gtdbtk-2.4.0/db GTDBTK_db/
    # optional, currently disabled: store the database path in the environment
    # conda env config vars set GTDBTK_DATA_PATH="/gxfs_work/geomar/smomw681/DATABASES/GTDBTK_db/db"
    conda install bioconda::mash=2.3 
    # libgsl causes a dependency problem, so create a symbolic link libgsl.so.25 that points to libgsl.so.27
    ln -s /gxfs_work/geomar/smomw681/.conda/envs/GTDBTK/lib/libgsl.so.27 /gxfs_work/geomar/smomw681/.conda/envs/GTDBTK/lib/libgsl.so.25
    # check the link; output recorded when this was run:
    ls -l /gxfs_work/geomar/smomw681/.conda/envs/GTDBTK/lib/libgsl.so.25
    # lrwxrwxrwx 1 smomw681 smomw 62 Apr  4 03:37 /gxfs_work/geomar/smomw681/.conda/envs/GTDBTK/lib/libgsl.so.25 -> /gxfs_work/geomar/smomw681/.conda/envs/GTDBTK/lib/libgsl.so.27


# AntiSMASH
    # First attempt: install pinned dependencies, then antismash.
    conda create --name AntiSMASH
    conda activate AntiSMASH
    conda install conda-forge::scikit-learn=1.6.1 -y
    conda install conda-forge::icu=73.2 -y
       # - current : 75.1 # for bioconda::meme
    conda install python=3.10
       # - current : 3.13.1
    conda install bioconda::meme=5.8.5 -y
    conda install hmmer2 hmmer diamond fasttree prodigal blast
    conda install bioconda::antismash -y

    # Previous error with installation of AntiSMASH: it didn't work, so started from scratch.
    # The environment has to be deactivated before it can be removed.
    conda deactivate
    conda remove -n AntiSMASH --all
    conda create --name AntiSMASH
    conda activate AntiSMASH
    # load modules (see "For each and every environment")
    conda install bioconda::antismash -y 
    # Recorded solver output of the command above (kept as comments so the cell stays runnable):
    # error: Could not solve for environment specs
    # The following packages are incompatible
    # └─ antismash is not installable because there are no viable options
    #    ├─ antismash [4.0.1|4.0.2|4.1.0] would require
    #    │  └─ scikit-learn 0.18.2 , which does not exist (perhaps a missing channel);
    #    ├─ antismash 4.2.0 would require
    #    │  └─ scikit-learn 0.18.* , which does not exist (perhaps a missing channel);
    #    ├─ antismash 4.2.0 would require
    #    │  └─ icu >=64.2,<65.0a0 , which does not exist (perhaps a missing channel);
    #    ├─ antismash [5.1.1|5.1.2|...|6.1.1] would require
    #    │  └─ pyscss, which does not exist (perhaps a missing channel);
    #    └─ antismash 7.1.0 would require
    #       └─ libsass, which does not exist (perhaps a missing channel).
    conda install conda-forge::scikit-learn=1.6.1 -y 
    # but removed it because it isn't recognized
    conda install conda-forge::libsass=0.22.0 -y
    conda install conda-forge/label/python_rc::_python_rc -y
    conda install python=3.10 -y
    conda install bioconda::antismash=7.1.0 -y 
    download-antismash-databases
    conda install hmmer2 hmmer diamond fasttree prodigal blast
    # antismash my_input.gbk # for use
    # install multismash for ARG statistics
    # Clone into its own sub-directory: git refuses to clone into the existing,
    # non-empty bin/ directory, and the cd below expects bin/multismash.
    git clone https://github.com/zreitz/multismash.git  /gxfs_work/geomar/smomw681/.conda/envs/AntiSMASH/bin/multismash
    cd  /gxfs_work/geomar/smomw681/.conda/envs/AntiSMASH/bin/multismash
    pip install . --user

# deepbgc 
    # Create the DeepBGC environment with python in it.
    conda create -n DeepBGC python
    # Activate it first; otherwise the installs below go into whichever
    # environment is currently active instead of DeepBGC.
    conda activate DeepBGC
    conda install bioconda::hmmer
    conda install bioconda::prodigal
    pip install deepbgc # v0.1.31
    deepbgc download    # download the database


# Deep ARG: create the DeepARG environment
    conda create --name DeepARG
    conda activate DeepARG
    conda install bioconda::deeparg=1.0.4 -y
    conda install bioconda::diamond
    # deeparg is additionally installed with pip from its git repository
    pip install git+https://github.com/gaarangoa/deeparg.git
    # download the deeparg data into the DATABASES directory
    deeparg download_data -o "/gxfs_work/geomar/smomw681/DATABASES/DeepARG"
    # NOTE(review): deepbgc is installed into the DeepARG environment here as well — confirm this is intended
    pip install deepbgc # alternatively: conda install bioconda::deepbgc=0.1.31, but there is a warning about that on the official site
    deepbgc download



# Check where a package/module is coming from (append the program name, e.g. `which python`):
which

# To leave the active conda environment:
conda deactivate