In [None]:
import os
import numpy as np
import pandas as pd
import pybedtools
import simu
import subprocess
from pybedtools import BedTool

# ---- experiment size -------------------------------------------------------
seed_init = 123             # passed to np.random.seed before the first gene pick
n_vars = 10                 # number of fusions generated per fusion category
n_background_genes = 100    # genes written unmodified to both case and control
min_exons = 3               # passed to simu.get_valid_txs

# ---- fusion construction ---------------------------------------------------
n_exons = 2
block_range = (30, 200)
# NOTE(review): ins_range is not referenced by any other visible cell (it is
# not part of the `params` dict) -- confirm whether simu needs it.
ins_range = (7, 50)

# ---- ART read simulation ---------------------------------------------------
fold = 50          # ART -f
frag_size = 300    # ART -m
frag_sd = 20       # ART -s
read_len = 150     # ART -l

# ---- tool, reference and output locations ----------------------------------
# These are absolute paths on the original author's machine; edit before running.
art_illumina = '/Users/marek.cmero/apps/art_bin_MountRainier/ART_ILLUMINA'
genome_fasta = '/Users/marek.cmero/reference/fastas/Homo_sapiens.GRCh38.dna.primary_assembly.fa'
gtf_ref = '/Users/marek.cmero/reference/gtf/chess_mini_chr12_ref.gtf'
out_prefix = '/Users/marek.cmero/Desktop/output/fusions'
control_fasta = out_prefix + '-control.fasta'
case_fasta = out_prefix + '-case.fasta'

In [None]:
# Load the GTF annotation as a BedTool.
gr = BedTool(gtf_ref)

# Gene-level feature lookup; handed to simu.write_fusion in later cells.
gene_trees = simu.get_gene_features(gr)

# Keep only exon records (third GTF column == 'exon'); saveas() materialises
# the filtered result so it can be reused across cells.
all_exons = gr.filter(lambda x: x[2] == 'exon').saveas()

# Transcripts/genes accepted by simu.get_valid_txs for the given min_exons.
# valid_txs comes back as (transcript, gene) pairs; only the unique
# transcript IDs are kept.
valid_txs, valid_genes = simu.get_valid_txs(all_exons, min_exons)
valid_txs = np.unique([tx for tx, gn in valid_txs])

# Accumulates one record per simulated fusion across the cells below.
fus_info = []

# Settings forwarded to simu.write_fusion.
params = {'n_exons': n_exons,
         'block_range': block_range,
         'out_prefix': out_prefix}

# Remove FASTA output left over from a previous run.
# NOTE(review): presumably the simu writers append to these files -- confirm.
for stale_fasta in (control_fasta, case_fasta):
    if os.path.exists(stale_fasta):
        os.remove(stale_fasta)

# Create the output directory in-process rather than shelling out to
# `mkdir -p`, so failures raise instead of being silently ignored.
outdir = os.path.dirname(out_prefix)
if outdir:  # empty when out_prefix has no directory component
    os.makedirs(outdir, exist_ok=True)

## Generate canonical fusions

Select `n_vars` random genes, pair each with one of `n_vars` random partner genes, and fuse the first `n_exons` exons of one gene to the `n_exons` terminal exons of its partner.

In [None]:
%%time

# pick fusion genes
np.random.seed(seed_init)
fus_genes, available_genes = simu.pick_genes(n_vars * 2, available_genes)
fusions = zip(fus_genes[:n_vars], fus_genes[n_vars:])

# make fusion genes
for gene1, gene2 in fusions:
    print('Generating %s:%s fusion...' % (gene1, gene2))

    # select first transcript from each gene
    tx1 = simu.get_transcripts(gene1, all_exons, valid_txs)[0]
    tx2 = simu.get_transcripts(gene2, all_exons, valid_txs)[0]

    fus_parts = simu.write_fusion(tx1, tx2, all_exons, genome_fasta, params, gene_trees)
    fus_parts.extend(['(%s:%s)' % (gene1, gene2), 'canonical_fusion'])
    fus_info.append(fus_parts)

## Generate unpartnered fusions

In [None]:
%%time
var_genes = fus_genes.copy()
ufus_genes, available_genes = simu.pick_genes(n_vars, available_genes)

for idx, gene in enumerate(ufus_genes):
    print('Generating %s unpartnered fusion...' % gene)
                                          
    # select first transcript from each gene
    tx = simu.get_transcripts(gene1, all_exons, valid_txs)[0]
        
    fus_parts = simu.write_fusion(tx, None, all_exons, genome_fasta, params, gene_trees)
    fus_parts.extend(['(%s:)' % (gene), 'unpartnered_fusion'])
    fus_info.append(fus_parts) 

## Fusions with extended exon at boundary

In [None]:
%%time
# pick fusion genes
efus_genes, available_genes = simu.pick_genes(n_vars * 2, available_genes)

# make fusion genes
fusions = zip(efus_genes[:n_vars], efus_genes[n_vars:])
for gene1, gene2 in fusions:
    print('Generating %s:%s fusion with extended exon...' % (gene1, gene2))

    # select first transcript from each gene
    tx1 = simu.get_transcripts(gene1, all_exons, valid_txs)[0]
    tx2 = simu.get_transcripts(gene2, all_exons, valid_txs)[0]

    fus_parts = simu.write_fusion(tx1, tx2, all_exons, genome_fasta, params, gene_trees, add='EE')
    fus_parts.extend(['(%s:%s)' % (gene1, gene2), 'EE_fusion'])
    fus_info.append(fus_parts)

## Fusions with novel exons at boundary

In [None]:
%%time
# pick fusion genes
nfus_genes, available_genes = simu.pick_genes(n_vars * 2, available_genes)

# make fusion genes
fusions = zip(nfus_genes[:n_vars], nfus_genes[n_vars:])
for gene1, gene2 in fusions:
    print('Generating %s:%s fusion with novel exon...' % (gene1, gene2))

    # select first transcript from each gene
    tx1 = simu.get_transcripts(gene1, all_exons, valid_txs)[0]
    tx2 = simu.get_transcripts(gene2, all_exons, valid_txs)[0]

    fus_parts = simu.write_fusion(tx1, tx2, all_exons, genome_fasta, params, gene_trees, add='NE')
    fus_parts.extend(['(%s:%s)' % (gene1, gene2), 'NE_fusion'])
    fus_info.append(fus_parts)

## Fusions with insertions at boundary

In [None]:
%%time
# pick fusion genes
ifus_genes, available_genes = simu.pick_genes(n_vars * 2, available_genes)

# make fusion genes
fusions = zip(ifus_genes[:n_vars], ifus_genes[n_vars:])
for gene1, gene2 in fusions:
    print('Generating %s:%s fusion with a boundary insertion...' % (gene1, gene2))

    # select first transcript from each gene
    tx1 = simu.get_transcripts(gene1, all_exons, valid_txs)[0]
    tx2 = simu.get_transcripts(gene2, all_exons, valid_txs)[0]

    fus_parts = simu.write_fusion(tx1, tx2, all_exons, genome_fasta, params, gene_trees, add='INS')
    fus_parts.extend(['(%s:%s)' % (gene1, gene2), 'INS_fusion'])
    fus_info.append(fus_parts)

## Write background genes

In [None]:
%%time
bg_set, available_genes = simu.pick_genes(n_background_genes, available_genes)
for gene in bg_set:
    tx = simu.get_transcripts(gene, all_exons, valid_txs=valid_txs)[0]
    exons = all_exons.filter(lambda x: x['transcript_id'] == tx).saveas()
    tx_seq, strand = simu.get_seq(exons, genome_fasta)
    simu.write_wildtype_sequence(tx_seq, strand, control_fasta, tx)
    simu.write_wildtype_sequence(tx_seq, strand, case_fasta, tx)

## Generate reads with ART

In [None]:
%%time
# generate reads with art illumina
seeds = np.random.randint(0, 99999, 2)

# generate case sample
subprocess.call([art_illumina, '-ss', 'HS25', '-i', case_fasta, 
                 '-p', '-l', str(read_len), '-f', str(fold), '-m', str(frag_size),
                 '-s', str(frag_sd), '-rs', str(seeds[0]), '-o', '%s-case_R' % out_prefix])

# generate control
subprocess.call([art_illumina, '-ss', 'HS25', '-i', control_fasta, 
             '-p', '-l', str(read_len), '-f', str(fold), '-m', str(frag_size),
             '-s', str(frag_sd), '-rs', str(seeds[1]), '-o', '%s-control_R' % out_prefix])

In [None]:
%%time
for sample in ['case', 'control']:
    for r in range(2):
        outf = open('%s-%s_R%d.fastq.gz' % (out_prefix, sample, (r+1)), 'w')
        subprocess.call(['gzip', '-c', '%s-%s_R%d.fq' % (out_prefix, sample, (r+1))], stdout=outf)
        outf.close()

## Write details for fusions simulated

In [None]:
# Tabulate the records accumulated in `fus_info` by the fusion cells above.
# NOTE: this rebinds `fus_info` from a list to a DataFrame, so the cell cannot
# be re-run without first re-running the cells that build the list.
fus_info = pd.DataFrame(fus_info, columns=['tx1', 'insert', 'tx2', 'fusion', 'type'])

# 'fusion' labels have the form '(gene1:gene2)' or '(gene1:)'; strip the
# surrounding parentheses and split on ':' to recover the two gene names.
fus_info['gene1'] = fus_info.fusion.apply(lambda x: x.split(':')[0][1:])
fus_info['gene2'] = fus_info.fusion.apply(lambda x: x.split(':')[1][:-1])
del fus_info['fusion']

fus_info.to_csv('%s_simulated.tsv' % out_prefix, index=False, sep='\t')