In [1]:
import os
import numpy as np
import pandas as pd
import pybedtools
import simu
import subprocess
from pybedtools import BedTool

In [2]:
# high-level parameters
seed_init = 123
n_vars = 10
n_background_genes = 100
min_exons = 3

# simulation parameters
n_exons = 2
block_range = (30, 200)
ins_range = (7, 50)

# read generation parameters
fold = 50
frag_size = 300
frag_sd = 20
read_len = 150

In [3]:
# references
art_illumina = '/Users/marek.cmero/apps/art_bin_MountRainier/ART_ILLUMINA'
genome_fasta = '/Users/marek.cmero/reference/fastas/Homo_sapiens.GRCh38.dna.primary_assembly.fa'
gtf_ref = '/Users/marek.cmero/reference/gtf/chess_mini_chr12_ref.gtf'
out_prefix = '/Users/marek.cmero/Desktop/output/fusions'
control_fasta = '%s-control.fasta' % out_prefix
case_fasta = '%s-case.fasta' % out_prefix

# cleanup (from older runs)
if os.path.exists(control_fasta):
    os.remove(control_fasta)
if os.path.exists(case_fasta):
    os.remove(case_fasta)

In [4]:
%%time

# build GTF reference
gr = BedTool(gtf_ref)

# ensure each transcript in reference has at least N exons
all_exons = gr.filter(lambda x: x[2] == 'exon').saveas()
all_txs = [(tx['transcript_id'], simu.get_gene_name(tx)) for tx in all_exons]
valid_txs = pd.DataFrame(pd.Series(all_txs).value_counts(), columns=['exon_count'])
valid_txs = valid_txs[valid_txs.exon_count >= min_exons]
valid_txs = valid_txs.index.values

all_genes = np.unique([gene for tx, gene in valid_txs if gene != ''])
var_genes = np.empty(0)

# make gene start/end reference
gene_trees = simu.get_gene_features(gr)

# params dict
params = {'n_exons': n_exons,
         'block_range': block_range,
         'ins_range': ins_range,
         'out_prefix': out_prefix}

# make outdir
outdir = '/'.join(out_prefix.split('/')[:-1])
subprocess.call(['mkdir', '-p', outdir])

CPU times: user 11.3 s, sys: 173 ms, total: 11.5 s
Wall time: 11.5 s


## Generate canonical fusions

Select `n_vars` random genes with `n_vars` random partners and fuse first `n_exons` exons to `n_exons` terminal exons.

In [5]:
%%time
# pick fusion genes
np.random.seed(seed_init)
fus_genes = np.random.choice(all_genes, n_vars * 2, replace=False)
fusions = zip(fus_genes[:n_vars], fus_genes[n_vars:])
fus_txs = [] # which fusions were used in the fusion; for reference

# make fusion genes
for gene1, gene2 in fusions:
    print('Generating %s:%s fusion...' % (gene1, gene2))

    # select first transcript from each gene
    tx1 = [tx for tx, gn in valid_txs if gn == gene1][0]
    tx2 = [tx for tx, gn in valid_txs if gn == gene2][0]

    fus_parts = simu.write_fusion(tx1, tx2, all_exons, genome_fasta, params, gene_trees)
    fus_parts.extend(['(%s:%s)' % (gene1, gene2), 'canonical_fusion'])
    fus_txs.append(fus_parts)

Generating NCKAP1L:ALKBH2 fusion...
Generating LOC105369676:FAM216A fusion...
Generating DIABLO:LOC105370063 fusion...
Generating NOP2:SLC6A13 fusion...
Generating MRPS35:LOC105369758 fusion...
Generating WNT10B:ANKRD33 fusion...
Generating ESPL1:DHH fusion...
Generating RNFT2:LINC01559 fusion...
Generating MYF5:YARS2 fusion...
Generating ARL6IP4:NRIP2 fusion...
CPU times: user 47.5 s, sys: 489 ms, total: 48 s
Wall time: 49.6 s


## Generate unpartnered fusions

In [6]:
%%time
var_genes = fus_genes.copy()
available_genes = list(set(all_genes).symmetric_difference(var_genes))
ufus_genes = np.random.choice(available_genes, n_vars, replace=False)
var_genes = np.concatenate([var_genes, ufus_genes])

for idx, gene in enumerate(ufus_genes):
    print('Generating %s unpartnered fusion...' % gene)
                                          
    # select first transcript from each gene
    tx = [tx for tx, gn in valid_txs if gn == gene][0]
        
    fus_parts = simu.write_fusion(tx, None, all_exons, genome_fasta, params, gene_trees)
    fus_parts.extend(['(%s:)' % (gene), 'unpartnered_fusion'])
    fus_txs.append(fus_parts) 

Generating ANKS1B unpartnered fusion...
Generating LOC105369701 unpartnered fusion...
Generating CD69 unpartnered fusion...
Generating FAM101A unpartnered fusion...
Generating GABARAPL1 unpartnered fusion...
Generating MYBPC1 unpartnered fusion...
Generating FAM222A-AS1 unpartnered fusion...
Generating CPM unpartnered fusion...
Generating TMEM116 unpartnered fusion...
Generating LOC105369740 unpartnered fusion...
CPU times: user 39.8 s, sys: 536 ms, total: 40.4 s
Wall time: 41.8 s


## Fusions with extended exon at boundary

In [7]:
%%time
# pick fusion genes
available_genes = list(set(all_genes).symmetric_difference(var_genes))
efus_genes = np.random.choice(available_genes, n_vars*2, replace=False)
var_genes = np.concatenate([var_genes, efus_genes])

# make fusion genes
fusions = zip(efus_genes[:n_vars], efus_genes[n_vars:])
for gene1, gene2 in fusions:
    print('Generating %s:%s fusion with extended exon...' % (gene1, gene2))

    # select first transcript from each gene
    tx1 = [tx for tx, gn in valid_txs if gn == gene1][0]
    tx2 = [tx for tx, gn in valid_txs if gn == gene2][0]

    fus_parts = simu.write_fusion(tx1, tx2, all_exons, genome_fasta, params, gene_trees, add='EE')
    fus_parts.extend(['(%s:%s)' % (gene1, gene2), 'EE_fusion'])
    fus_txs.append(fus_parts)

Generating ACSS3:LOC102724960 fusion with extended exon...
Generating RP11-446N19.1:LOC100506869 fusion with extended exon...
Generating MGAT4C:LOC574538 fusion with extended exon...
Generating LRP1:KRR1 fusion with extended exon...
Generating LOC107984447:C12orf77 fusion with extended exon...
Generating IGF1:LOC105370066 fusion with extended exon...
Generating TMEM5:RAB3IP fusion with extended exon...
Generating LOC105369639:KRT77 fusion with extended exon...
Generating RP11-314D7.1:SCARB1 fusion with extended exon...
Generating UNG:KIAA1033 fusion with extended exon...
CPU times: user 55.4 s, sys: 697 ms, total: 56.1 s
Wall time: 1min


## Fusions with novel exons at boundary

In [8]:
%%time
# pick fusion genes
available_genes = list(set(all_genes).symmetric_difference(var_genes))
nfus_genes = np.random.choice(available_genes, n_vars*2, replace=False)
var_genes = np.concatenate([var_genes, nfus_genes])

# make fusion genes
fusions = zip(nfus_genes[:n_vars], nfus_genes[n_vars:])
for gene1, gene2 in fusions:
    print('Generating %s:%s fusion with novel exon...' % (gene1, gene2))

    # select first transcript from each gene
    tx1 = [tx for tx, gn in valid_txs if gn == gene1][0]
    tx2 = [tx for tx, gn in valid_txs if gn == gene2][0]

    fus_parts = simu.write_fusion(tx1, tx2, all_exons, genome_fasta, params, gene_trees, add='NE')
    fus_parts.extend(['(%s:%s)' % (gene1, gene2), 'NE_fusion'])
    fus_txs.append(fus_parts)

Generating BHLHE41:CDK4 fusion with novel exon...
Generating AEBP2:LOC338797 fusion with novel exon...
Generating CAPRIN2:KLRK1 fusion with novel exon...
Generating LOC107984446:TSPAN19 fusion with novel exon...
Generating TFCP2:E2F7 fusion with novel exon...
Generating LOC105369946:UHRF1BP1L fusion with novel exon...
Generating POLR3B:NRAV fusion with novel exon...
Generating HNF1A:ULK1 fusion with novel exon...
Generating ATP6V0A2:LOC107984484 fusion with novel exon...
Generating ZCCHC8:A2M fusion with novel exon...
CPU times: user 55.5 s, sys: 709 ms, total: 56.3 s
Wall time: 1min


## Fusions with insertions at boundary

In [9]:
%%time
# pick fusion genes
available_genes = list(set(all_genes).symmetric_difference(var_genes))
ifus_genes = np.random.choice(available_genes, n_vars*2, replace=False)
var_genes = np.concatenate([var_genes, ifus_genes])

# make fusion genes
fusions = zip(ifus_genes[:n_vars], ifus_genes[n_vars:])
for gene1, gene2 in fusions:
    print('Generating %s:%s fusion with a boundary insertion...' % (gene1, gene2))

    # select first transcript from each gene
    tx1 = [tx for tx, gn in valid_txs if gn == gene1][0]
    tx2 = [tx for tx, gn in valid_txs if gn == gene2][0]

    fus_parts = simu.write_fusion(tx1, tx2, all_exons, genome_fasta, params, gene_trees, add='INS')
    fus_parts.extend(['(%s:%s)' % (gene1, gene2), 'INS_fusion'])
    fus_txs.append(fus_parts)

Generating IAPP:CISTR fusion with a boundary insertion...
Generating RP3-461F17.3:LOC105378258 fusion with a boundary insertion...
Generating LOC105369626:LOC105369980 fusion with a boundary insertion...
Generating TMPO:ALX1 fusion with a boundary insertion...
Generating ERBB3:GNPTAB fusion with a boundary insertion...
Generating GNS:BTG1 fusion with a boundary insertion...
Generating BLOC1S1:LOC107984487 fusion with a boundary insertion...
Generating KRT78:FAIM2 fusion with a boundary insertion...
Generating RP11-983C2.2:SLC38A1 fusion with a boundary insertion...
Generating LOC105369841:ISCU fusion with a boundary insertion...
CPU times: user 47.9 s, sys: 517 ms, total: 48.4 s
Wall time: 50 s


## Write background genes

In [10]:
%%time
# write background genes
available_genes = list(set(all_genes).symmetric_difference(var_genes))
bg_set = np.random.choice(available_genes, n_background_genes)
for gene in bg_set:
    tx = [tx for tx, gn in valid_txs if gn == gene][0]
    exons = all_exons.filter(lambda x: x['transcript_id'] == tx).saveas()
    tx_seq, strand = simu.get_seq(exons, genome_fasta)
    simu.write_sequence(tx_seq, strand, control_fasta, tx)
    simu.write_sequence(tx_seq, strand, case_fasta, tx)

CPU times: user 3min 55s, sys: 2.29 s, total: 3min 57s
Wall time: 4min 4s


## Generate reads with Art

In [11]:
%%time
# generate reads with art illumina
seeds = np.random.randint(0, 99999, 2)

# generate case sample
subprocess.call([art_illumina, '-ss', 'HS25', '-i', case_fasta, 
                 '-p', '-l', str(read_len), '-f', str(fold), '-m', str(frag_size),
                 '-s', str(frag_sd), '-rs', str(seeds[0]), '-o', '%s-case_R' % out_prefix])

# generate control
subprocess.call([art_illumina, '-ss', 'HS25', '-i', control_fasta, 
             '-p', '-l', str(read_len), '-f', str(fold), '-m', str(frag_size),
             '-s', str(frag_sd), '-rs', str(seeds[1]), '-o', '%s-control_R' % out_prefix])

CPU times: user 2.65 ms, sys: 10.9 ms, total: 13.5 ms
Wall time: 9.78 s


In [12]:
%%time
for sample in ['case', 'control']:
    for r in range(2):
        outf = open('%s-%s_R%d.fastq.gz' % (out_prefix, sample, (r+1)), 'w')
        subprocess.call(['gzip', '-c', '%s-%s_R%d.fq' % (out_prefix, sample, (r+1))], stdout=outf)
        outf.close()

CPU times: user 3.92 ms, sys: 16.3 ms, total: 20.3 ms
Wall time: 12.3 s


## Write details for fusions simulated

In [13]:
fus_info = pd.DataFrame(fus_txs, columns=['tx1', 'insert', 'tx2', 'fusion', 'type'])
fus_info['gene1'] = fus_info.fusion.apply(lambda x: x.split(':')[0][1:])
fus_info['gene2'] = fus_info.fusion.apply(lambda x: x.split(':')[1][:-1])
del fus_info['fusion']
fus_info.to_csv('%s_simulated_fusions.tsv' % out_prefix, index=False, sep='\t')