#### 实验名称：鉴定eccDNA的所有上游分析pipeline整理
#### 实验目的：将benchmark工作所用的所有上游分析pipeline进行整理（NGS pipeline和TGS pipeline分别以NGS_100X和Nanopore_100X数据为例）
#### 实验时间：2023-07-03
#### 实验人员：罗淞文

In [None]:
# NGS pipeline整理

## Circle-Map pipeline (NGS paired-end reads)
#!/bin/bash
# SLURM batch job: align paired-end reads with bwa, build the query-name- and
# coordinate-sorted BAM files, extract circular read candidates and run
# Circle-Map Realign.
# bash is requested explicitly because `source` is not a POSIX sh builtin.
#SBATCH -J circle_map_test_NGS_100X_01
#SBATCH -o /home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/logs/circle_map_test_NGS_100X_01_%j.log
#SBATCH -e /home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/logs/circle_map_test_NGS_100X_01_%j.err
#SBATCH --qos=aepmemqos
#SBATCH --exclusive
#SBATCH --time=240:00:00
#SBATCH -N 1 -n 40 -p 2TB-AEP-Mem
# SLURM_JOB_PARTITION is the variable set inside a running job; SBATCH_PARTITION
# is only an input variable read by sbatch at submission time.
echo "Running on $SLURM_JOB_PARTITION partition"
echo "Start time is $(date)"
source /home/lifesci/liuk0617/miniconda3/etc/profile.d/conda.sh
conda activate Circle-Map
# Stop at the first failing step (including failures inside pipelines) so later
# steps never run on missing or partial files. Enabled after environment
# activation so conda's own shell hooks run unchanged.
set -eo pipefail
echo "This job runs on the following nodes:"
echo "$SLURM_JOB_NODELIST"
echo "This job has allocated $SLURM_JOB_CPUS_PER_NODE cpu core"
R1fastq=/gpfs/home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/2nd_data/ratio/NGS100X_01_R1.fq
R2fastq=/gpfs/home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/2nd_data/ratio/NGS100X_01_R2.fq
reference_dir=/home/lifesci/luosongwen/eccDNA/stimulate_data/Reference/UCSC/hg38
bwa_dir=/home/lifesci/luosongwen/eccDNA/stimulate_data/Reference/bwa/hg38
result_dir=/gpfs/home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/result/circle_map_result/NGS100X_01
# -p: create missing parents and do not fail when the directory already exists.
mkdir -p "$result_dir"
cd "$result_dir"
## Alignment of the reads to the reference genome
bwa mem -t 40 -q "$bwa_dir/hg38.fa" "$R1fastq" "$R2fastq" > simple_ecc.sam
## Preparing the files for Circle-Map
# Drop the matching @HD header line. The filtered output goes to a temporary
# file first: redirecting straight onto simple_ecc.sam would truncate it before
# samtools had read it.
samtools view -h simple_ecc.sam | sed -e '/@HD\tVN:1.5\tSO:unsorted\tGO:query/d' > simple_ecc.filtered.sam
mv simple_ecc.filtered.sam simple_ecc.sam
# Query-name sorted BAM (input of ReadExtractor and -qbam of Realign).
samtools sort -@ 40 -n -o qname_simple_ecc.bam simple_ecc.sam
# Coordinate sorted BAM (-sbam of Realign).
samtools sort -@ 40 -o sorted_simple_ecc.bam simple_ecc.sam
Circle-Map ReadExtractor -i qname_simple_ecc.bam -o circular_read_candidates.bam
samtools sort -@ 40 -o sort_circular_read_candidates.bam circular_read_candidates.bam
## Indexing the BAM files
samtools index sort_circular_read_candidates.bam
samtools index sorted_simple_ecc.bam
## Detecting the circular DNA
Circle-Map Realign -t 40 -i sort_circular_read_candidates.bam -qbam qname_simple_ecc.bam -sbam sorted_simple_ecc.bam -fasta "$reference_dir/hg38.fa" -o circle_NGS100X_01.bed
echo "End time: $(date)"

## Circle_finder pipeline (NGS paired-end reads)
#!/bin/bash
# SLURM batch job: run the Circle_finder bwa-mem/samblaster pipeline script on
# one paired-end sample.
# bash is requested explicitly because `source` is not a POSIX sh builtin.
#SBATCH -J Circle_Finder_NGS100X_01
#SBATCH -o /home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/logs/Circle_Finder_NGS100X_01_%j.log
#SBATCH -e /home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/logs/Circle_Finder_NGS100X_01_%j.err
#SBATCH --qos=aepmemqos
#SBATCH --exclusive
#SBATCH --time=240:00:00
#SBATCH -N 1 -n 40 -p 2TB-AEP-Mem
# SLURM_JOB_PARTITION is the variable set inside a running job; SBATCH_PARTITION
# is only an input variable read by sbatch at submission time.
echo "Running on $SLURM_JOB_PARTITION partition"
echo "Time is $(date)"
source /home/lifesci/luosongwen/miniconda3/etc/profile.d/conda.sh
conda activate eccDNA
# Stop at the first failing step; in particular a failed `cd` must not let the
# pipeline write its outputs into whatever directory the job started in.
set -eo pipefail
bash_dir=/home/lifesci/luosongwen/eccDNA/Circle_finder-master
cd /gpfs/home/lifesci/luosongwen/eccDNA/Circle_finder-master/Circle_DNA_NGS100X
echo "Directory is $PWD"
echo "This job runs on the following nodes:"
echo "$SLURM_JOB_NODELIST"
echo "This job has allocated $SLURM_JOB_CPUS_PER_NODE cpu core"
# Positional arguments of the pipeline script, in the order they are passed.
# NOTE(review): the variable names reflect the presumed meaning of each
# position — confirm against the README of the Circle_finder version in use.
threads=40
genome_fasta=/home/lifesci/luosongwen/eccDNA/Circle_finder-master/hg38.fa
R1fastq=/gpfs/home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/2nd_data/ratio/NGS100X_01_R1.fq
R2fastq=/gpfs/home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/2nd_data/ratio/NGS100X_01_R2.fq
min_non_overlap=10
sample_name=Circle_final_NGS100X
genome_build=hg38
bash "$bash_dir/circle_finder-pipeline-bwa-mem-samblaster.sh" "$threads" "$genome_fasta" "$R1fastq" "$R2fastq" "$min_non_overlap" "$sample_name" "$genome_build"
echo "Time is $(date)"

## ecc_finder NGS mapping-mode (map-sr) pipeline
#!/bin/bash
# SLURM batch job: run ecc_finder in map-sr mode on one paired-end sample.
# bash is requested explicitly because `source` is not a POSIX sh builtin.
#SBATCH -J ecc_finder_test_NGS100X_01
#SBATCH -o /home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/logs/ecc_finder_test_NGS100X_01_%j.log
#SBATCH -e /home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/logs/ecc_finder_test_NGS100X_01_%j.err
#SBATCH --qos=aepmemqos
#SBATCH --exclusive
#SBATCH --time=240:00:00
#SBATCH -N 1 -n 40 -p 2TB-AEP-Mem
# SLURM_JOB_PARTITION is the variable set inside a running job; SBATCH_PARTITION
# is only an input variable read by sbatch at submission time.
echo "Running on $SLURM_JOB_PARTITION partition"
echo "Start time is $(date)"
source /home/lifesci/luosongwen/miniconda3/etc/profile.d/conda.sh
conda activate ecc_finder
# Stop at the first failing step so the end-time line is only logged for a run
# that actually completed.
set -eo pipefail
echo "This job runs on the following nodes:"
echo "$SLURM_JOB_NODELIST"
echo "This job has allocated $SLURM_JOB_CPUS_PER_NODE cpu core"

R1fastq=/gpfs/home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/2nd_data/ratio/NGS100X_01_R1.fq
R2fastq=/gpfs/home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/2nd_data/ratio/NGS100X_01_R2.fq
Reference_dir=/home/lifesci/luosongwen/eccDNA/stimulate_data/Reference/bwa/hg38
ecc_finder_path=/home/lifesci/luosongwen/eccDNA/ecc_finder
# ecc_finder.py is invoked by relative path and -o is relative, so the output
# directory is created inside the ecc_finder checkout.
cd "$ecc_finder_path"

python ecc_finder.py map-sr "$Reference_dir/hg38.fa" "$R1fastq" "$R2fastq" -r "$Reference_dir/hg38.fa" -t 40 -o NGS100X_01_final_test_map
echo "End time: $(date)"

## ecc_finder NGS assembly-mode (asm-sr) pipeline
#!/bin/bash
# SLURM batch job: run ecc_finder in asm-sr mode on one paired-end sample.
# bash is requested explicitly because `source` is not a POSIX sh builtin.
#SBATCH -J ecc_finder_test_NGS100X_01
#SBATCH -o /home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/logs/ecc_finder_test_NGS100X_01_%j.log
#SBATCH -e /home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/logs/ecc_finder_test_NGS100X_01_%j.err
#SBATCH --qos=aepmemqos
#SBATCH --exclusive
#SBATCH --time=240:00:00
#SBATCH -N 1 -n 40 -p 2TB-AEP-Mem
# SLURM_JOB_PARTITION is the variable set inside a running job; SBATCH_PARTITION
# is only an input variable read by sbatch at submission time.
echo "Running on $SLURM_JOB_PARTITION partition"
echo "Start time is $(date)"
source /home/lifesci/luosongwen/miniconda3/etc/profile.d/conda.sh
conda activate ecc_finder
# Stop at the first failing step so the end-time line is only logged for a run
# that actually completed.
set -eo pipefail
echo "This job runs on the following nodes:"
echo "$SLURM_JOB_NODELIST"
echo "This job has allocated $SLURM_JOB_CPUS_PER_NODE cpu core"

R1fastq=/gpfs/home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/2nd_data/ratio/NGS100X_01_R1.fq
R2fastq=/gpfs/home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/2nd_data/ratio/NGS100X_01_R2.fq
ecc_finder_path=/home/lifesci/luosongwen/eccDNA/ecc_finder
# ecc_finder.py is invoked by relative path and -o is relative, so the output
# directory is created inside the ecc_finder checkout.
cd "$ecc_finder_path"

# No reference is passed in this mode; only the two read files are given.
python ecc_finder.py asm-sr "$R1fastq" "$R2fastq" -t 40 -o NGS100X_final_asm
echo "End time: $(date)"

## ECCsplorer pipeline (NGS paired-end reads)
#!/bin/bash
# SLURM batch job: run ECCsplorer in mapping mode (-m map) on one paired-end
# sample against the cleaned hg38 reference.
# bash is requested explicitly because `source` is not a POSIX sh builtin.
#SBATCH -J eccsplorer_test_NGS100X_01
#SBATCH -o /home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/logs/eccsplorer_test_NGS100X_01_%j.log
#SBATCH -e /home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/logs/eccsplorer_test_NGS100X_01_%j.err
#SBATCH --qos=aepmemqos
#SBATCH --exclusive
#SBATCH --time=240:00:00
#SBATCH -N 1 -n 40 -p 2TB-AEP-Mem
# SLURM_JOB_PARTITION is the variable set inside a running job; SBATCH_PARTITION
# is only an input variable read by sbatch at submission time.
echo "Running on $SLURM_JOB_PARTITION partition"
echo "Start time is $(date)"

echo "This job runs on the following nodes:"
echo "$SLURM_JOB_NODELIST"
echo "This job has allocated $SLURM_JOB_CPUS_PER_NODE cpu core"

# The -out path below is relative, so the job must not continue if this
# directory cannot be entered.
cd /home/lifesci/luosongwen/eccDNA/stimulate_data/test_data || exit 1
source /home/lifesci/luosongwen/miniconda3/etc/profile.d/conda.sh
conda activate eccsplorer
# Stop at the first failing step so the end-time line is only logged for a run
# that actually completed.
set -eo pipefail

eccsplorer_script=/home/lifesci/luosongwen/miniconda3/envs/eccsplorer/bin/ECCsplorer/ECCsplorer.py
R1fastq=/gpfs/home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/2nd_data/ratio/NGS100X_01_R1.fq
R2fastq=/gpfs/home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/2nd_data/ratio/NGS100X_01_R2.fq
reference=/home/lifesci/luosongwen/eccDNA/stimulate_data/Reference/UCSC/hg38/hg38.clean.fa
python "$eccsplorer_script" -out result/eccsplorer_result/final_NGS100X_01 "$R1fastq" "$R2fastq" -ref "$reference" -log -m map
echo "End time is $(date)"

In [None]:
# TGS pipeline整理

## CReSIL pipeline (TGS, Nanopore reads)
#!/bin/bash
# SLURM batch job: run the CReSIL trim, identify and annotate stages on one
# Nanopore sample, using the reference files in the CReSIL example folder.
# bash is requested explicitly because `source` is not a POSIX sh builtin.
#SBATCH -J Cresil_nanopore100X_01
#SBATCH -o /home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/logs/cresil_nanopore100X_01_test_%j.log
#SBATCH -e /home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/logs/cresil_nanopore100X_01_test_%j.err
#SBATCH --qos=aepmemqos
#SBATCH --exclusive
#SBATCH --time=240:00:00
#SBATCH -N 1 -n 40 -p 2TB-AEP-Mem
# SLURM_JOB_PARTITION is the variable set inside a running job; SBATCH_PARTITION
# is only an input variable read by sbatch at submission time.
echo "Running on $SLURM_JOB_PARTITION partition"
echo "Start time is $(date)"
source /home/lifesci/luosongwen/miniconda3/etc/profile.d/conda.sh
conda activate cresil
# Each stage consumes the previous stage's output, so stop at the first
# failing step instead of running the rest on missing files.
set -eo pipefail
echo "This job runs on the following nodes:"
echo "$SLURM_JOB_NODELIST"
echo "This job has allocated $SLURM_JOB_CPUS_PER_NODE cpu core"

## Cresil_eccDNA_test
## Go to the example folder (the reference.* files below are relative to it)
cd /home/lifesci/luosongwen/eccDNA/TGS_tools/cresil_v2.0/example
fastq=/gpfs/home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/nanopore_01_data/NanoporeCircle100X_01.num.fastq
cresil_out=cresil_Nanopore100X_01_result
## Run trim
cresil trim -t 40 -fq "$fastq" -r reference.mmi -o "$cresil_out"
## Run eccDNA identification for enriched data
cresil identify -t 40 -fa reference.fa -fai reference.fa.fai -fq "$fastq" -trim "$cresil_out/trim.txt"
## Alternative kept for reference: identification for whole-genome long-read data
#cresil identify_wgls -t 40 -r reference.mmi -fa reference.fa -fai reference.fa.fai -fq /gpfs/home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/3rd_simulate_data/WGS/nanoporeWGS100x.fastq -trim cresil_nanoporeWGS100x_result/trim.txt
## Run eccDNA annotation
cresil annotate -t 40 -rp reference.rmsk.bed -cg reference.cpg.bed -gb reference.gene.bed -identify "$cresil_out/eccDNA_final.txt"
## Run visualize eccDNA (ec1) - optional, disabled
#cresil visualize -t 4 -c ec1 -identify cresil_result/eccDNA_final.txt
## Run Circos (ec1) - optional, disabled
#cd cresil_result/for_Circos/ec1
#circos -noparanoid -conf circos.conf
echo "End time: $(date)"

## NanoCircle pipeline (TGS, Nanopore reads)
#!/bin/bash
# SLURM batch job: map Nanopore reads with minimap2, derive merged coverage
# regions with bedtools, then run the NanoCircle "Circles" and "Merge" commands.
# bash is requested explicitly because `source` is not a POSIX sh builtin.
#SBATCH -J Nanocircle_nanopore100X
#SBATCH -o /home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/logs/nanocircle_nanopore100X_test_%j.log
#SBATCH -e /home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/logs/nanocircle_nanopore100X_test_%j.err
#SBATCH --qos=aepmemqos
#SBATCH --exclusive
#SBATCH --time=240:00:00
#SBATCH -N 1 -n 40 -p 2TB-AEP-Mem
# SLURM_JOB_PARTITION is the variable set inside a running job; SBATCH_PARTITION
# is only an input variable read by sbatch at submission time.
echo "Running on $SLURM_JOB_PARTITION partition"
echo "Start time is $(date)"
source /home/lifesci/luosongwen/miniconda3/etc/profile.d/conda.sh
# NOTE(review): this job activates the "cresil" environment — confirm it is the
# one that provides minimap2, samtools, bedtools and NanoCircle's dependencies.
conda activate cresil
# pipefail makes a failure of minimap2 or bedtools fail the whole pipeline;
# without it only the last command's status counts and an empty BAM/BED would
# be passed on silently. -e then stops the job at that point.
set -eo pipefail
echo "This job runs on the following nodes:"
echo "$SLURM_JOB_NODELIST"
echo "This job has allocated $SLURM_JOB_CPUS_PER_NODE cpu core"

nanocircle_dir=/home/lifesci/luosongwen/eccDNA/TGS_tools/cresil_nanocircle_v1.0/NanoCircle
reference=/home/lifesci/luosongwen/eccDNA/stimulate_data/Reference/UCSC/hg38/hg38.clean.fa
data_dir=/gpfs/home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/nanopore_01_data
fastq=$data_dir/NanoporeCircle100X_01.num.fastq
bam=$data_dir/NanoporeCircle100X_01.num.bam
cov_bed=$data_dir/NanoporeCircle100X_01_1000_cov.bed
# Prefix handed to "Circles" via -o; "Merge" then reads <prefix>_Complex.bed.
out_prefix=$data_dir/NanoporeCircle100X_01

cd "$nanocircle_dir"
# Map reads and sort the alignments; "-" makes samtools sort read from stdin.
minimap2 -t 40 -ax map-ont --secondary=no "$reference" "$fastq" | samtools sort - > "$bam"
samtools index "$bam"
# Covered regions, merged when at most 1000 bp apart, sorted by chromosome and start.
bedtools genomecov -bg -ibam "$bam" | bedtools merge -d 1000 -i stdin | sort -V -k1,1 -k2,2n > "$cov_bed"
python "$nanocircle_dir/NanoCircle_arg.py" Circles -i "$cov_bed" -b "$bam" -o "$out_prefix"
python "$nanocircle_dir/NanoCircle_arg.py" Merge -i "${out_prefix}_Complex.bed" -o "${out_prefix}_complex_merged.bed"
echo "End time: $(date)"


## ecc_finder TGS mapping-mode (map-ont) pipeline
#!/bin/bash
# SLURM batch job: run ecc_finder in map-ont mode on one Nanopore sample.
# bash is requested explicitly because `source` is not a POSIX sh builtin.
#SBATCH -J ecc_finder_test_map_nanopore100X
#SBATCH -o /home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/logs/ecc_finder_test_map_nanopore100X_%j.log
#SBATCH -e /home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/logs/ecc_finder_test_map_nanopore100X_%j.err
#SBATCH --qos=aepmemqos
#SBATCH --exclusive
#SBATCH --time=240:00:00
#SBATCH -N 1 -n 40 -p 2TB-AEP-Mem
# SLURM_JOB_PARTITION is the variable set inside a running job; SBATCH_PARTITION
# is only an input variable read by sbatch at submission time.
echo "Running on $SLURM_JOB_PARTITION partition"
echo "Start time is $(date)"
source /home/lifesci/luosongwen/miniconda3/etc/profile.d/conda.sh
conda activate ecc_finder
# Stop at the first failing step so the end-time line is only logged for a run
# that actually completed.
set -eo pipefail
echo "This job runs on the following nodes:"
echo "$SLURM_JOB_NODELIST"
echo "This job has allocated $SLURM_JOB_CPUS_PER_NODE cpu core"

R1fastq=/gpfs/home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/nanopore_01_data/NanoporeCircle100X_01.num.fastq
Reference_dir=/gpfs/home/lifesci/luosongwen/eccDNA/stimulate_data/Reference/UCSC/hg38
ecc_finder_path=/home/lifesci/luosongwen/eccDNA/ecc_finder
# ecc_finder.py is invoked by relative path and -o is relative, so the output
# directory is created inside the ecc_finder checkout.
cd "$ecc_finder_path"
python ecc_finder.py map-ont "$Reference_dir/hg38.clean.fa" "$R1fastq" -r "$Reference_dir/hg38.clean.fa" -t 40 -o Nanopore100X_final_map
echo "End time: $(date)"

## ecc_finder TGS assembly-mode (asm-ont) pipeline
#!/bin/bash
# SLURM batch job: run ecc_finder in asm-ont mode on one Nanopore sample.
# bash is requested explicitly because `source` is not a POSIX sh builtin.
#SBATCH -J ecc_finder_test_nanopore100X
#SBATCH -o /home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/logs/ecc_finder_test_nanopore100X_%j.log
#SBATCH -e /home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/logs/ecc_finder_test_nanopore100X_%j.err
#SBATCH --qos=aepmemqos
#SBATCH --exclusive
#SBATCH --time=240:00:00
#SBATCH -N 1 -n 40 -p 2TB-AEP-Mem
# SLURM_JOB_PARTITION is the variable set inside a running job; SBATCH_PARTITION
# is only an input variable read by sbatch at submission time.
echo "Running on $SLURM_JOB_PARTITION partition"
echo "Start time is $(date)"
source /home/lifesci/luosongwen/miniconda3/etc/profile.d/conda.sh
conda activate ecc_finder
# Stop at the first failing step so the end-time line is only logged for a run
# that actually completed.
set -eo pipefail
echo "This job runs on the following nodes:"
echo "$SLURM_JOB_NODELIST"
echo "This job has allocated $SLURM_JOB_CPUS_PER_NODE cpu core"

R1fastq=/gpfs/home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/nanopore_01_data/NanoporeCircle100X_01.num.fastq
ecc_finder_path=/home/lifesci/luosongwen/eccDNA/ecc_finder
# ecc_finder.py is invoked by relative path and -o is relative, so the output
# directory is created inside the ecc_finder checkout.
cd "$ecc_finder_path"
# No reference is passed in this mode; only the read file is given.
python ecc_finder.py asm-ont "$R1fastq" -t 40 -o Nanopore100X_final_asm
echo "End time: $(date)"

## eccDNA_RCA_nanopore pipeline (TGS, Nanopore reads)
#!/bin/bash
# SLURM batch job: align Nanopore reads to the reference with minimap2 (PAF
# output) and run eccDNA_RCA_nanopore.py on the reads plus the alignments.
# bash is requested explicitly because `source` is not a POSIX sh builtin.
#SBATCH -J eccDNA_RCA_nanopore100X_01
#SBATCH -o /home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/logs/eccDNA_RCA_nanopore100X_01_%j.log
#SBATCH -e /home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/logs/eccDNA_RCA_nanopore100X_01_%j.err
#SBATCH --qos=aepmemqos
#SBATCH --exclusive
#SBATCH --time=240:00:00
#SBATCH -N 1 -n 40 -p 2TB-AEP-Mem
# SLURM_JOB_PARTITION is the variable set inside a running job; SBATCH_PARTITION
# is only an input variable read by sbatch at submission time.
echo "Running on $SLURM_JOB_PARTITION partition"
echo "Start time is $(date)"
source /home/lifesci/luosongwen/miniconda3/etc/profile.d/conda.sh
# NOTE(review): this job activates the "cresil" environment — confirm it is the
# one that provides minimap2 and the Python dependencies of eccDNA_RCA_nanopore.py.
conda activate cresil
# pipefail is needed because the caller is piped into tee: without it the
# pipeline's status would be tee's, hiding a failure of the caller itself.
set -eo pipefail
echo "This job runs on the following nodes:"
echo "$SLURM_JOB_NODELIST"
echo "This job has allocated $SLURM_JOB_CPUS_PER_NODE cpu core"

reference=/gpfs/home/lifesci/luosongwen/eccDNA/stimulate_data/Reference/UCSC/hg38/hg38.clean.fa
data_dir=/gpfs/home/lifesci/luosongwen/eccDNA/stimulate_data/test_data/nanopore_01_data
fastq=$data_dir/NanoporeCircle100X_01.num.fastq
paf=$data_dir/NanoporeCircle100X_01.num.paf

# The caller is started by relative path and writes its --info/--seq/--var
# files and out.log relative to this directory.
cd /home/lifesci/luosongwen/eccDNA/TGS_tools/eccDNA_RCA_nanopore
minimap2 -t 40 -c "$reference" "$fastq" > "$paf"
./eccDNA_RCA_nanopore.py --fastq "$fastq" --paf "$paf" --info nanopore100X_01_num_info.tsv --seq nanopore100X_01_num.fa --var nanopore100X_01_num_var.tsv --reference "$reference" --verbose | tee out.log
# End-time line added for parity with the other pipeline jobs in this file.
echo "End time: $(date)"
