In [1]:
import os
import numpy as np
import pandas as pd
import pybedtools
import simu
import subprocess
from pybedtools import BedTool

# ---- high-level parameters ----
seed_init = 123            # passed to np.random.seed before the first gene pick
n_vars = 10                # number of fusions generated per fusion category
n_background_genes = 100   # number of wild-type genes written to both samples
min_exons = 3              # passed to simu.get_valid_txs when selecting transcripts

# ---- simulation parameters (forwarded to simu.write_fusion via `params`) ----
n_exons = 2
block_range = (30, 200)
ins_range = (7, 50)

# ---- read generation parameters (ART -f, -m, -s and -l respectively) ----
fold = 50
frag_size = 300
frag_sd = 20
read_len = 150

# ---- reference inputs and output locations ----
# NOTE(review): these are absolute, machine-specific paths; adjust before running elsewhere.
art_illumina = '/Users/marek.cmero/apps/art_bin_MountRainier/ART_ILLUMINA'
genome_fasta = '/Users/marek.cmero/reference/fastas/Homo_sapiens.GRCh38.dna.primary_assembly.fa'
gtf_ref = '/Users/marek.cmero/reference/gtf/chess_mini_chr12_ref.gtf'
out_prefix = '/Users/marek.cmero/Desktop/output/fusions'

# every output file name is derived from out_prefix
control_fasta = out_prefix + '-control.fasta'
case_fasta = out_prefix + '-case.fasta'

In [2]:
# build GTF reference
gr = BedTool(gtf_ref)

# make gene start/end reference
gene_trees = simu.get_gene_features(gr)

# keep only the exon records (third GTF column == 'exon'); saveas() materialises
# the filtered result so it can be iterated repeatedly by later cells
all_exons = gr.filter(lambda x: x[2] == 'exon').saveas()

# unique, non-empty gene names; look each name up once per exon rather than twice
# (assumes simu.get_gene_name has no side effects -- TODO confirm)
exon_gene_names = (simu.get_gene_name(ex) for ex in all_exons)
all_genes = np.unique([name for name in exon_gene_names if name != ''])

# get valid txs
valid_txs, valid_genes = simu.get_valid_txs(all_exons, min_exons)
valid_txs = np.unique([tx for tx, gn in valid_txs])

# set membership avoids rescanning valid_genes for every candidate gene
valid_gene_set = set(valid_genes)
available_genes = [gene for gene in all_genes if gene in valid_gene_set]

# accumulator for the per-fusion records returned by simu.write_fusion
fus_info = []

# params dict
params = {'n_exons': n_exons,
          'ins_range': ins_range,
          'block_range': block_range,
          'out_prefix': out_prefix}

# remove FASTA files left over from a previous run
# (presumably the simu writers append to these files -- verify against simu)
for stale_fasta in (control_fasta, case_fasta):
    if os.path.exists(stale_fasta):
        os.remove(stale_fasta)

# make the output directory without shelling out to `mkdir -p`; skip when
# out_prefix has no directory component (dirname is then the empty string)
outdir = os.path.dirname(out_prefix)
if outdir and not os.path.isdir(outdir):
    os.makedirs(outdir)

0

## Generate canonical fusions

Select `n_vars` random genes, each with a random partner gene (`n_vars` partners in total), and fuse the first `n_exons` exons to the `n_exons` terminal exons.

In [3]:
%%time

# seed the global NumPy RNG once, before the first gene pick
np.random.seed(seed_init)

# draw 2 * n_vars genes; the remaining pool is carried forward to later cells
fus_genes, available_genes = simu.pick_genes(2 * n_vars, available_genes)

# pair gene i of the first half with gene i of the second half
for gene_pair in zip(fus_genes[:n_vars], fus_genes[n_vars:]):
    print('Generating %s:%s fusion...' % gene_pair)

    # use the first transcript returned for each gene of the pair
    tx_pair = tuple(simu.get_transcripts(gene, all_exons, valid_txs)[0]
                    for gene in gene_pair)

    # write the fusion and keep its record for the final summary table
    fus_info.append(simu.write_fusion(tx_pair, gene_pair, all_exons,
                                      genome_fasta, params, gene_trees))

Generating NCKAP1L:ALKBH2 fusion...
Generating LOC105369676:FAM216A fusion...
Generating DIABLO:LOC105370063 fusion...
Generating NOP2:SLC6A13 fusion...
Generating MRPS35:LOC105369758 fusion...
Generating WNT10B:ANKRD33 fusion...
Generating ESPL1:DHH fusion...
Generating RNFT2:LINC01559 fusion...
Generating MYF5:YARS2 fusion...
Generating ARL6IP4:NRIP2 fusion...
CPU times: user 2min 17s, sys: 877 ms, total: 2min 18s
Wall time: 2min 20s


## Generate unpartnered fusions

In [4]:
%%time
# draw n_vars genes for fusions that have no partner gene
ufus_genes, available_genes = simu.pick_genes(n_vars, available_genes)

for gene in ufus_genes:
    print('Generating %s unpartnered fusion...' % gene)

    # use the first transcript returned for the gene
    tx = simu.get_transcripts(gene, all_exons, valid_txs)[0]

    # None in the second slot of both tuples marks the missing partner
    fus_parts = simu.write_fusion((tx, None), (gene, None), all_exons,
                                  genome_fasta, params, gene_trees)

    fus_info.append(fus_parts)

Generating NAB2 unpartnered fusion...
Generating SLC2A3 unpartnered fusion...
Generating RPAP3 unpartnered fusion...
Generating RACGAP1 unpartnered fusion...
Generating LOC105369750 unpartnered fusion...
Generating LOC107984516 unpartnered fusion...
Generating LOC105369649 unpartnered fusion...
Generating PIANP unpartnered fusion...
Generating PRR13 unpartnered fusion...
Generating IKZF4 unpartnered fusion...
CPU times: user 1min 24s, sys: 693 ms, total: 1min 25s
Wall time: 1min 26s


## Fusions with extended exon at boundary

In [5]:
%%time
# draw a fresh batch of 2 * n_vars genes from those still available
efus_genes, available_genes = simu.pick_genes(2 * n_vars, available_genes)

# pair gene i of the first half with gene i of the second half
for gene_pair in zip(efus_genes[:n_vars], efus_genes[n_vars:]):
    print('Generating %s:%s fusion with extended exon...' % gene_pair)

    # use the first transcript returned for each gene of the pair
    tx_pair = tuple(simu.get_transcripts(gene, all_exons, valid_txs)[0]
                    for gene in gene_pair)

    # add='EE' selects the extended-exon variant in simu.write_fusion
    fus_info.append(simu.write_fusion(tx_pair, gene_pair, all_exons,
                                      genome_fasta, params, gene_trees, add='EE'))

Generating LOC107984446:CTC-465D4.1 fusion with extended exon...
Generating TAOK3:NANOGNB fusion with extended exon...
Generating RP11-123M21.1:MLEC fusion with extended exon...
Generating CLLU1OS:PRB1 fusion with extended exon...
Generating LOC100507424:LOC105370044 fusion with extended exon...
Generating TCP11L2:CPSF6 fusion with extended exon...
Generating TRHDE:ESYT1 fusion with extended exon...
Generating TCHP:MSI1 fusion with extended exon...
Generating TMEM5-AS1:OVCH1 fusion with extended exon...
Generating HOTAIR:AAAS fusion with extended exon...
CPU times: user 2min 31s, sys: 1.36 s, total: 2min 33s
Wall time: 2min 37s


## Fusions with novel exons at boundary

In [6]:
%%time
# draw a fresh batch of 2 * n_vars genes from those still available
nfus_genes, available_genes = simu.pick_genes(2 * n_vars, available_genes)

# pair gene i of the first half with gene i of the second half
for gene_pair in zip(nfus_genes[:n_vars], nfus_genes[n_vars:]):
    print('Generating %s:%s fusion with novel exon...' % gene_pair)

    # use the first transcript returned for each gene of the pair
    tx_pair = tuple(simu.get_transcripts(gene, all_exons, valid_txs)[0]
                    for gene in gene_pair)

    # add='NE' selects the novel-exon variant in simu.write_fusion
    fus_info.append(simu.write_fusion(tx_pair, gene_pair, all_exons,
                                      genome_fasta, params, gene_trees, add='NE'))

Generating GNB3:PRMT8 fusion with novel exon...
Generating NANOG:PRDM4 fusion with novel exon...
Generating LOC105370079:MRPL51 fusion with novel exon...
Generating LOC102724960:TPH2 fusion with novel exon...
Generating PKP2:IRAK3 fusion with novel exon...
Generating DDX55:LOC105369853 fusion with novel exon...
Generating USP15:LOC105379613 fusion with novel exon...
Generating LOC101928471:KLRF1 fusion with novel exon...
Generating LOC105369669:FAM222A fusion with novel exon...
Generating C2CD5:NDUFA12 fusion with novel exon...
CPU times: user 2min 31s, sys: 1.37 s, total: 2min 32s
Wall time: 2min 36s


## Fusions with insertions at boundary

In [7]:
%%time
# draw a fresh batch of 2 * n_vars genes from those still available
ifus_genes, available_genes = simu.pick_genes(2 * n_vars, available_genes)

# pair gene i of the first half with gene i of the second half
for gene_pair in zip(ifus_genes[:n_vars], ifus_genes[n_vars:]):
    print('Generating %s:%s fusion with a boundary insertion...' % gene_pair)

    # use the first transcript returned for each gene of the pair
    tx_pair = tuple(simu.get_transcripts(gene, all_exons, valid_txs)[0]
                    for gene in gene_pair)

    # add='INS' selects the boundary-insertion variant in simu.write_fusion
    fus_info.append(simu.write_fusion(tx_pair, gene_pair, all_exons,
                                      genome_fasta, params, gene_trees, add='INS'))

Generating TARBP2:GNPTAB fusion with a boundary insertion...
Generating RP11-900F13.3:MYBPC1 fusion with a boundary insertion...
Generating ATP6V0A2:TMEM116 fusion with a boundary insertion...
Generating LOC105369592:CHPT1 fusion with a boundary insertion...
Generating LOC107983948:LOC107984467 fusion with a boundary insertion...
Generating GLT1D1:TUBA1A fusion with a boundary insertion...
Generating CERS5:LOC105369639 fusion with a boundary insertion...
Generating LOC105370080:LOC105369972 fusion with a boundary insertion...
Generating USP5:NDUFA4L2 fusion with a boundary insertion...
Generating RP3-473L9.4:TBC1D15 fusion with a boundary insertion...
CPU times: user 2min 22s, sys: 1.09 s, total: 2min 23s
Wall time: 2min 24s


## Write background genes

In [8]:
%%time
# draw the background genes from whatever the fusion cells left in the pool
bg_set, available_genes = simu.pick_genes(n_background_genes, available_genes)

for bg_gene in bg_set:
    # use the first transcript returned for the gene
    bg_tx = simu.get_transcripts(bg_gene, all_exons, valid_txs=valid_txs)[0]

    # restrict the exon set to this transcript; saveas() evaluates the filter
    # immediately, while bg_tx still refers to the current transcript
    tx_exons = all_exons.filter(lambda exon: exon['transcript_id'] == bg_tx).saveas()
    tx_seq, strand = simu.get_seq(tx_exons, genome_fasta)

    # the unmodified sequence goes into both samples, control first
    for sample_fasta in (control_fasta, case_fasta):
        simu.write_wildtype_sequence(tx_seq, strand, sample_fasta, bg_tx)

CPU times: user 8min, sys: 3.55 s, total: 8min 3s
Wall time: 8min 10s


## Generate reads with ART

In [9]:
%%time
# generate reads with art illumina
# one ART seed per sample, drawn from the global NumPy RNG (seeded earlier with seed_init)
seeds = np.random.randint(0, 99999, 2)

# case is simulated with seeds[0], control with seeds[1]
art_runs = (('case', case_fasta, seeds[0]),
            ('control', control_fasta, seeds[1]))

for sample, sample_fasta, art_seed in art_runs:
    art_cmd = [art_illumina, '-ss', 'HS25', '-i', sample_fasta,
               '-p', '-l', str(read_len), '-f', str(fold), '-m', str(frag_size),
               '-s', str(frag_sd), '-rs', str(art_seed),
               '-o', '%s-%s_R' % (out_prefix, sample)]

    # check_call raises CalledProcessError on a non-zero exit status, so a failed
    # simulation stops the notebook here instead of surfacing later as missing .fq files
    subprocess.check_call(art_cmd)

CPU times: user 2.78 ms, sys: 11.5 ms, total: 14.3 ms
Wall time: 8.81 s


In [10]:
%%time
# compress each simulated read file (<out_prefix>-<sample>_R<1|2>.fq) to .fastq.gz
for sample in ['case', 'control']:
    for r in range(2):
        fq_prefix = '%s-%s_R%d' % (out_prefix, sample, (r+1))

        # the with-block closes the output even if gzip fails; the file receives
        # compressed bytes, so it is opened in binary mode
        with open(fq_prefix + '.fastq.gz', 'wb') as outf:
            # check_call raises CalledProcessError if gzip exits non-zero
            subprocess.check_call(['gzip', '-c', fq_prefix + '.fq'], stdout=outf)

CPU times: user 3.99 ms, sys: 17.1 ms, total: 21.1 ms
Wall time: 9.75 s


## Write details for fusions simulated

In [11]:
# column labels for the records collected from simu.write_fusion
# (presumably in the same order as the fields of each record -- verify against simu)
fus_columns = ['loc1', 'gene1', 'tx1', 'insert',
               'loc2', 'gene2', 'tx2', 'fusion_type']

# turn the list of records into a table and write it as a tab-separated file
fus_info = pd.DataFrame(fus_info, columns=fus_columns)
simulated_tsv = '%s_simulated.tsv' % out_prefix
fus_info.to_csv(simulated_tsv, sep='\t', index=False)