# Cancer Genome Simulation Overview
This notebook provides a pipeline to run insilicoSV sequentially to simulate cancer genomes.
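This notebook takes as input a YAML file containing:
  - The path to the reference genome (`reference`) and the total read coverage to simulate (`coverage`).
  - The paths to several insilicoSV YAML config files (`config_files`).
  - The tumor clones (`clones`), each with the ordered sequence of config files to run to obtain that clone's genome.
  - The purity of each clone (`purity`), given as a percentage, which sets the clone's share of the simulated reads.

Refer to the provided `clones.yaml` config for an example of the expected syntax.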

## Generate the Clone Genomes

In [None]:
import sys

import yaml
from pysam import VariantFile
from IPython.display import Image
from collections import defaultdict
import os
import subprocess
import shutil
from math import ceil

In [None]:
# Thread count handed to minimap2 (-t) and samtools (-@). It is kept as a string
# because it is spliced directly into the shell command token lists.
number_threads = '10'

In [None]:
%%sh
# Start from an empty working directory. -f keeps the very first run quiet,
# when ./clones/ does not exist yet.
rm -rf ./clones/
mkdir -p clones

# download the chr21 reference
wget -O clones/chr21.fa.gz https://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/chr21.fa.gz
gunzip -f clones/chr21.fa.gz

In [None]:
%%sh
# Copy the clone definition YAML into the working directory; the Python cells
# below read it from ./clones/clones.yaml
cp ./configs/clones.yaml ./clones/.

# Display the config so the clone structure is visible in the notebook
cat ./clones/clones.yaml

In [None]:
def run_bash_process(command):
    """Run a command through the shell, echo its stderr and fail on a non-zero exit.

    Args:
        command: Sequence of string tokens. The tokens are joined with single
            spaces and executed with ``shell=True``, so shell operators such as
            ``|`` and ``>`` can be passed as tokens.

    Raises:
        subprocess.CalledProcessError: If the shell exits with a non-zero status.
            The return code and captured stderr are printed before re-raising.
    """
    try:
        output = subprocess.run(
            ' '.join(command),
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            shell=True,
            text=True,
            # Without check=True subprocess.run never raises CalledProcessError,
            # which would leave the handler below unreachable and let failed
            # commands pass silently.
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print("Command failed with return code:", e.returncode)
        print(e.stderr)
        raise

    # stderr is surfaced on success too, so tool warnings stay visible.
    if output.stderr:
        print(output.stderr)

In [None]:
def call_insilicosv(path, config_name):
    """Run insilicoSV on the config file found at ``path + config_name``.

    ``path`` is concatenated as-is, so it must already end with a path separator.
    """
    config_file = path + config_name
    run_bash_process(['insilicosv', '-c', config_file])

In [None]:
def merge_insilicosv_output(path):
    """Concatenate ``sim.hapA.fa`` and ``sim.hapB.fa`` from ``path`` into ``sim.fa``.

    ``path`` is used as a plain string prefix, so it must end with a path separator.
    """
    haplotype_fastas = [path + name for name in ('sim.hapA.fa', 'sim.hapB.fa')]
    merged_fasta = path + 'sim.fa'
    run_bash_process(['cat', *haplotype_fastas, '>', merged_fasta])

In [None]:
def clonal_genome_generator(folder_path, config_name):
    """Build every clone genome by chaining insilicoSV runs described in the clones YAML.

    Each clone lists the config indices ("dependencies") to apply in order. Every
    step runs in a nested ``dependency_<i>/`` folder, so clones that share a
    prefix of dependencies share those runs: a step whose folder already exists
    is not executed again.

    Args:
        folder_path: Working directory containing the clones YAML.
        config_name: File name of the clones YAML inside ``folder_path``.

    Returns:
        Tuple ``(reference, global_coverage, vcf_dict, reads_to_simulate)``:
        ``vcf_dict`` maps ``str(clone_name)`` to the VCF of the clone's last step,
        and ``reads_to_simulate`` maps each clone name to ``[genome_folder, purity]``.
    """
    with open(folder_path + config_name) as config_yaml:
        config = yaml.safe_load(config_yaml)
    reference = config['reference']

    def step_folder(parent, dependency):
        # Folder in which the given dependency runs, nested under its parent step.
        return parent + '/dependency_' + str(dependency) + '/'

    # Folders in which at least one clone ends. The merged sim.fa is needed in
    # all of them, regardless of which clone happens to create the folder first;
    # deciding this per clone would leave a shorter clone without sim.fa when a
    # longer clone sharing its prefix is listed before it.
    final_folders = set()
    for dependencies in config['clones'].values():
        folder = folder_path
        for dependency in dependencies:
            folder = step_folder(folder, dependency)
        final_folders.add(folder)

    reads_to_simulate = {}
    vcf_dict = {}
    for clone_name, dependencies in config['clones'].items():
        current_path = folder_path
        previous_vcf_path = ''
        for dependency in dependencies:
            current_path = step_folder(current_path, dependency)
            step_config_source = config['config_files'][dependency]
            # Separate name so the `config_name` argument is not shadowed.
            step_config_name = step_config_source.split('/')[-1]

            if os.path.exists(current_path):
                # Step already simulated for another clone: reuse its VCF.
                previous_vcf_path = current_path + 'sim.vcf'
                continue

            os.makedirs(current_path)
            shutil.copy(step_config_source, current_path)

            # Append the previous insilicoSV output as new reference or use the initial reference if first call
            # NOTE(review): the import entry is appended as a YAML list item, so it
            # presumably relies on the copied config ending with the list it belongs to.
            with open(current_path + step_config_name, 'a') as file:
                if previous_vcf_path:
                    file.write('\n  - import: ' + previous_vcf_path + '\n')
                file.write('\nreference: ' + reference + "\n")

            call_insilicosv(current_path, step_config_name)
            previous_vcf_path = current_path + 'sim.vcf'
            if current_path in final_folders:
                merge_insilicosv_output(current_path)

            # Drop the per-haplotype FASTAs once they are merged (or not needed).
            # The names match the ones read by merge_insilicosv_output.
            for haplotype_fasta in ('sim.hapA.fa', 'sim.hapB.fa'):
                haplotype_path = current_path + haplotype_fasta
                if os.path.exists(haplotype_path):
                    os.remove(haplotype_path)

        vcf_dict[str(clone_name)] = previous_vcf_path
        purity = config['purity'][clone_name]
        reads_to_simulate[clone_name] = [current_path, purity]
    global_coverage = config['coverage']
    return reference, global_coverage, vcf_dict, reads_to_simulate

In [None]:
# Working directory populated by the setup cells above; clones.yaml was copied into it.
root_path = './clones/'
# Simulate all clone genomes. The returned values drive the read simulation
# (reference, coverage, reads_to_simulate) and the SV plots (vcf_dict) below.
reference, coverage, vcf_dict, reads_to_simulate = clonal_genome_generator(root_path, 'clones.yaml')

## Read simulation
### Short-read simulation
Below we simulate paired-end short reads with ```DWGSIM```. Each clone is simulated at a coverage proportional to its purity, so that the merged reads reach the requested total coverage.

After generating the reads, we align them with minimap2 (short-read mode) and sort the alignments using ```samtools```.

In [None]:
def call_dwgsim(dwgsim_coverage, read_length, platform, genome, output_prefix):
    """Simulate paired-end reads from ``genome`` with DWGSIM in haploid mode (``-H``).

    Args:
        dwgsim_coverage: Mean coverage, passed to ``-C``.
        read_length: Length used for both mates (``-1`` and ``-2``).
        platform: DWGSIM platform code, passed to ``-c`` (this notebook uses
            ``'0'``, Illumina).
        genome: FASTA file to simulate reads from.
        output_prefix: Prefix of the files written by DWGSIM.

    Raises:
        FileNotFoundError: If the ``dwgsim`` executable is not on ``PATH``.
    """
    dwgsim_path = shutil.which('dwgsim')
    if dwgsim_path is None:
        # shutil.which returns None when the tool is missing; fail with a clear
        # message instead of a TypeError while joining the command tokens.
        raise FileNotFoundError("dwgsim executable not found on PATH")

    # -c selects the sequencing platform. (-o is the FASTQ output type; its
    # default of 0 still writes the per-read-end *.bwa.read{1,2}.fastq.gz files.)
    command = [
        dwgsim_path,
        '-C', str(dwgsim_coverage),
        '-1', str(read_length),
        '-2', str(read_length),
        '-c', str(platform),
        '-H',
        genome,
        output_prefix,
    ]
    run_bash_process(command)

In [None]:
def align_reads(platform, reference, reads, output_name):
    """Align reads with minimap2, sort and index the BAM, then print its coverage table.

    Args:
        platform: minimap2 preset passed to ``-ax`` (this notebook uses ``'sr'``
            and ``'map-hifi'``).
        reference: Reference FASTA to align against.
        reads: List of read file paths appended to the minimap2 command.
        output_name: Path of the sorted BAM to write. The coverage table is
            written next to it as ``output_name + '.coverage'``.
    """
    # minimap2 output is piped straight into samtools sort, so no intermediate
    # SAM file is written.
    command_align = ['minimap2', '-t', number_threads, '-ax', platform, reference] + reads + ['|', 'samtools', 'sort', '-@', number_threads, '-o', output_name, '-']
    run_bash_process(command_align)

    command_index = ['samtools', 'index', '-@', number_threads, output_name]
    run_bash_process(command_index)

    coverage_file = output_name + '.coverage'
    command_get_coverage = ['samtools', 'coverage', output_name, '>', coverage_file]
    run_bash_process(command_get_coverage)

    with open(coverage_file, 'r') as cov_file:
        # '\n' (not the literal '/n') so the coverage table starts on its own line.
        print('PROCESSED: Clone', output_name, 'coverage\n', cov_file.read())

In [None]:
def merge_clones(output_name, list_bams):
    """Merge the per-clone BAMs into ``output_name``, index it and print its coverage.

    Args:
        output_name: Path of the merged BAM. The coverage table is written to
            ``output_name + '.coverage'``.
        list_bams: Paths of the BAM files to merge.
    """
    coverage_file = output_name + '.coverage'
    # Each step is a progress message followed by the samtools command it announces.
    steps = (
        ('Merging clones...',
         ['samtools', 'merge', '-@', number_threads, '-o', output_name] + list_bams),
        ('Indexing reads...',
         ['samtools', 'index', '-@', number_threads, output_name]),
        ('Computing coverage...',
         ['samtools', 'coverage', output_name, '>', coverage_file]),
    )
    for message, samtools_command in steps:
        print(message)
        run_bash_process(samtools_command)

    with open(coverage_file, 'r') as cov_file:
        coverage_report = cov_file.read()
    print('PROCESSED: Whole cancer genome', output_name, 'coverage', coverage_report)

We then merge all the simulated reads to obtain the requested coverage.

In [None]:
read_length = 151
dwgsim_platform = '0' # Illumina
alignment_platform = 'sr'
list_sr_bams = []
for clone_name, (genome_folder, purity) in reads_to_simulate.items():
    print('Simulating reads for clone', clone_name)
    output_prefix = genome_folder + 'sim_sr.dwgsim'
    # purity is a percentage of the total coverage. The value is halved before
    # simulation, presumably because sim.fa concatenates both haplotypes; it is
    # rounded up so DWGSIM gets an integer coverage.
    clone_coverage = ceil(coverage * purity / 100 / 2)
    print('Clone purity', purity, ' Clone coverage', clone_coverage * 2)
    call_dwgsim(clone_coverage, read_length, dwgsim_platform, genome_folder + '/sim.fa', output_prefix)

    print('Aligning reads')
    r1 = output_prefix + '.bwa.read1.fastq.gz'
    r2 = output_prefix + '.bwa.read2.fastq.gz'
    # str() because YAML clone names are not guaranteed to be strings
    # (clonal_genome_generator applies the same conversion for vcf_dict keys).
    output_name = genome_folder + '/' + str(clone_name) + '_sim_sr.dwgsim.bam'
    list_sr_bams.append(output_name)
    reads = [r1, r2]
    align_reads(alignment_platform, reference, reads, output_name)

In [None]:
# Merged short-read BAM for the whole tumor; it is reused by the samplot cell below.
output_name_sr = root_path + 'cancer_genome_sr.bam'
merge_clones(output_name_sr, list_sr_bams)

### Long-read simulation
We use PBSIM3 to simulate HiFi reads from the synthetic genome. Since PBSIM3 outputs reads for each reference contig, we also combine the reads from the two synthetic haplotypes into a single FASTQ file.

After generating the reads, we align them with minimap2 (HiFi mode) and sort the alignments using ```samtools```.

In [None]:
def call_pbsim(pbsim_coverage, read_length_mean, accuracy_mean, genome, output_prefix):
    """Simulate long reads from ``genome`` with pbsim and merge them into one FASTQ.

    Args:
        pbsim_coverage: Depth passed to ``--depth``.
        read_length_mean: Mean read length passed to ``--length-mean``.
        accuracy_mean: Mean accuracy passed to ``--accuracy-mean``.
        genome: FASTA file to simulate reads from.
        output_prefix: Prefix of the pbsim outputs. The merged reads are written
            to ``output_prefix + '.fastq'``.

    Raises:
        RuntimeError: If ``CONDA_PREFIX`` is not set; the QSHMM model is looked up
            under ``$CONDA_PREFIX/data``.
    """
    conda_prefix = os.environ.get('CONDA_PREFIX')
    if not conda_prefix:
        # Fail with an actionable message instead of a TypeError on None + str.
        raise RuntimeError(
            "CONDA_PREFIX is not set; it is needed to locate data/QSHMM-RSII.model"
        )

    command = [
        'pbsim',
        '--depth', str(pbsim_coverage),
        '--genome', genome,
        '--prefix', output_prefix,
        '--strategy', 'wgs',
        '--method', 'qshmm',
        '--qshmm', conda_prefix + '/data/QSHMM-RSII.model',
        # str() so numeric values are accepted as well as strings.
        '--length-mean', str(read_length_mean),
        '--accuracy-mean', str(accuracy_mean),
    ]
    run_bash_process(command)
    print('Merging haplotypes')
    # '>' rather than '>>': re-running the cell must not append a second copy
    # of the reads to an existing merged FASTQ.
    merge_command = ['zcat', output_prefix + '*.fq.gz', '>', output_prefix + '.fastq']
    run_bash_process(merge_command)

In [None]:
read_length_mean = '20000'
accuracy_mean = '0.999'
alignment_platform = 'map-hifi'
print('Simulating long reads at coverage', coverage)
list_lr_bams = []
for clone_name, (genome_folder, purity) in reads_to_simulate.items():
    print('Simulating reads for clone', clone_name)
    # purity is a percentage of the total coverage. The value is halved before
    # simulation, presumably because sim.fa concatenates both haplotypes.
    clone_coverage = coverage * purity / 100 / 2
    print('Clone purity', purity, 'Clone coverage', clone_coverage * 2)
    output_prefix = genome_folder + 'sim_lr.pbsim'
    call_pbsim(clone_coverage, read_length_mean, accuracy_mean, genome_folder + '/sim.fa', output_prefix)

    print('Aligning reads')
    reads = [output_prefix + '.fastq']
    # str() because YAML clone names are not guaranteed to be strings
    # (clonal_genome_generator applies the same conversion for vcf_dict keys).
    output_name = genome_folder + '/' + str(clone_name) + '_sim_lr.pbsim.bam'
    list_lr_bams.append(output_name)
    align_reads(alignment_platform, reference, reads, output_name)

In [None]:
# Merged long-read BAM for the whole tumor; it is reused by the samplot cell below.
output_name_lr = root_path + 'cancer_genome_lr.bam'
merge_clones(output_name_lr, list_lr_bams)

## SV Visualization
Below we generate ```samplot``` illustrations for each SV simulated with ```insilicoSV```.

In [None]:
samplot_path = root_path + 'samplot/'
os.makedirs(samplot_path, exist_ok=True)

# Select the non-interactive Agg Matplotlib backend for the samplot subprocesses.
os.environ["MPLBACKEND"] = "Agg"

for dependency_name, vcf_path in vcf_dict.items():
    # Records sharing a title are pooled into one plot spanning all their breakends.
    rec2breakends = defaultdict(set)
    rec2chrom = {}
    vcf = VariantFile(vcf_path)
    for vcf_rec in vcf.fetch():
        vcf_info = dict(vcf_rec.info)
        title_fields = (
            vcf_info.get('SVTYPE', 'SNP'),
            vcf_info.get('SVID', vcf_rec.id),
            # "->" is spelled out so the title carries no shell redirection
            # character when it is joined into the samplot command line.
            vcf_info.get('GRAMMAR', '').replace("->", "-to-"),
        )
        sv_title = "_".join(str(field) for field in title_fields)
        breakends = rec2breakends[sv_title]
        breakends.update((vcf_rec.start, vcf_rec.stop))
        if 'TARGET' in vcf_info:
            breakends.add(vcf_info['TARGET'])
        rec2chrom[sv_title] = vcf_rec.chrom
    vcf.close()

    for sv_title, sv_breakends in rec2breakends.items():
        ordered_breakends = sorted(sv_breakends)
        start, end = ordered_breakends[0], ordered_breakends[-1]
        length = end - start
        # Window passed to samplot -w: the SV span plus 5%.
        wlen = ceil(length + length * 0.05)
        output_file = samplot_path + "/" + dependency_name + '_' + sv_title + ".png"
        command = [
            "samplot", "plot",
            "-n", "Illumina", "HiFi",
            "-b", output_name_sr, output_name_lr,
            "-s", str(start),
            "-e", str(end),
            "-c", rec2chrom[sv_title],
            "-t", sv_title,
            "-w", str(wlen),
            "--include_mqual", "0",
            "--separate_mqual", "1",
            "-o", output_file,
        ]
        run_bash_process(command)

In [None]:
%%sh
# List the plots written to the samplot output folder
ls -l clones/samplot/

We visualize some SVs below.

In [None]:
# Shared by all clones; shown from clone A's plot set
# (file names follow '<clone>_<SVTYPE>_<SVID>_<GRAMMAR>.png')
Image(filename=samplot_path + 'A_DUP_INV_sv15_A-to-aa.png')

In [None]:
# Present in clone D only
# (file names follow '<clone>_<SVTYPE>_<SVID>_<GRAMMAR>.png')
Image(filename=samplot_path + 'D_dDUP_sv4_A_-to-A_A.png')

In [None]:
# Shared by all clones; shown from clone E's plot set, where the title
# carries the 'Imported' SVID and an empty GRAMMAR field
Image(filename=samplot_path + 'E_DUP_Imported_sv8_.png')

In [None]:
# Present in three clones; shown from clone C's plot set, where the title
# carries the 'Imported' SVID and an empty GRAMMAR field
Image(filename=samplot_path + 'C_INDEL_Imported_sv10_.png')

In [None]:
# Present in clone C only
# samplot_path already ends with '/', so the file name is appended without a
# leading slash, like in the other display cells.
Image(filename=samplot_path + 'C_DEL_sv3_A-to-.png')