In [15]:
import tempfile
import numpy as np
import pandas as pd
import collections
import re
import os
import subprocess
import pybedtools
from IPython.core.debugger import set_trace
from intervaltree import Interval, IntervalTree
from pybedtools import BedTool
from Bio import SeqIO

In [16]:
# simulation parameters
SEED_INIT = 123
N_VARS = 10
N_BACKGROUND_GENES = 100
N_EXONS = 2 # fuse first N exons to last N exons
MIN_EXONS = 3
BLOCK_RANGE = (30, 200)
INS_RANGE = (3, 50)
FOLD = 50
FRAG_SIZE = 300
FRAG_SD = 20
READ_LEN = 150

In [17]:
# references
ART_ILLUMINA = '/Users/marek.cmero/apps/art_bin_MountRainier/ART_ILLUMINA'
GENOME_FASTA = '/Users/marek.cmero/reference/fastas/Homo_sapiens.GRCh38.dna.primary_assembly.fa'
GTF_REF = '/Users/marek.cmero/reference/gtf/chess_mini_chr12_ref.gtf'
FASTA_OUT = 'cryptic_variants_simu.fasta'
CONTROL_FASTA_OUT = 'cryptic_variants_control.fasta'
OUTDIR = '/Users/marek.cmero/Desktop/output'

# constants
BASES = list('GCAT')

# cleanup (from older runs)
if os.path.exists(FASTA_OUT):
    os.remove(FASTA_OUT)
if os.path.exists(CONTROL_FASTA_OUT):
    os.remove(CONTROL_FASTA_OUT)

In [18]:
# functions
def get_gene_name(row):
    '''
    Prevents KeyError if gene name missing
    '''
    try:
        return row.attrs['gene_name']
    except KeyError:
        return ''
    
def get_seq(gr):
    '''
    Returns a dictionary of exon sequences
    and the corresponding strand of the transcript
    '''
    block_seqs = gr.sequence(fi=GENOME_FASTA, s=True)
    block_dict = collections.OrderedDict()
    with tempfile.NamedTemporaryFile() as fa_tmp:
        fa_tmp.write(bytes(open(block_seqs.seqfn).read(), 'utf-8'))
        fa_tmp.flush()

        for record in SeqIO.parse(fa_tmp.name, 'fasta'):
            block_dict[record.id] = str(record.seq)

    strand = re.search('\(([-+])\)', next(iter(block_dict.keys())))
    assert strand
    
    return block_dict, strand.group(1)

def write_sequence(seq_dict, strand, output_file, name):
    '''
    Writes sequence dictionary to output file
    '''
    seq = [seq_dict[ex] for ex in seq_dict.keys()]
    seq = seq if strand == '+' else [s for s in reversed(seq)]

    with open(output_file, 'a') as fout:
        fout.write('>%s\n' % name)
        fout.write(''.join(seq) + '\n')
        
def get_chrom_features(chrom, gr):
    '''
    Get all merged intervals on given chromosome
    '''
    chrom_features = gr.filter(lambda x: x.chrom == chrom).merge()    
    chrom_features = [(g.start, g.end) for g in gr]
    
    chrom_tree = IntervalTree()
    [chrom_tree.addi(s, e) for s, e in chrom_features]

    return chrom_tree

def get_gene_features(chroms, gr):
    '''
    Get interval tree for start/ends for 
    each gene on each chromosome
    '''
    gn_ref = pd.DataFrame([(g.chrom, g.start, g.end, get_gene_name(g)) for g in gr])
    aggregator = {1: lambda x: min(x),
                  2: lambda x: max(x)}
    gn_ref = gn_ref.groupby([0, 3], as_index=False, sort=False).agg(aggregator)
    gn_ref = gn_ref[[0, 1, 2, 3]]
    gn_ref.columns = ['chrom', 'start', 'end', 'gene']
    gn_ref = gn_ref[gn_ref.gene!='']

    ref_trees = {}
    for chrom in chroms:
        chr_ref = gn_ref[gn_ref.chrom == chrom]
        ref_tree = IntervalTree()
        for s,e,g in zip(chr_ref['start'].values, chr_ref['end'].values, chr_ref['gene'].values):
            ref_tree.addi(s-1, e, g)
    ref_trees[chrom] = ref_tree
    
    return ref_trees

def get_exon_seq(ex_list, strand, gr, extended=True):
    '''
    Extends given exon, or creates a novel downstream
    exon with random size and returns its sequence. 
    If a reference exon exists that already extends
    or overlaps the given exon, it will extend or place
    the exon past the overlapping exon.
    '''
    # TODO: make sure exon size doesn't extend to the next exon 
    block_size = np.random.randint(BLOCK_RANGE[0], BLOCK_RANGE[1])
    gap_size = 0 if extended else np.random.randint(BLOCK_RANGE[0], BLOCK_RANGE[1])
    
    block = ex_list[1] if strand == '+' else ex_list[0]
    loc = re.compile('[:\-\(\)]').split(block)
    exon_start, exon_end = int(loc[1]), int(loc[2])
        
    start = exon_end + gap_size if strand == '+' else exon_start - gap_size - block_size
    end = start + block_size

    ex = get_chrom_features(loc[0], gr.merge())
    olap = ex.overlap(int(start), int(end))
    if len(olap) > 0:
        coords = list(olap)[0]
        s, e = int(coords[0]), int(coords[1])        
        if extended:
            start = start if strand == '+' else s - block_size
            end = end if strand == '-' else e + block_size
        else:          
            start = e + gap_size if strand == '+' else s - gap_size - block_size            
            end = start + block_size
    
    block_bed = '%s\t%d\t%d\t.\t1\t%s' % (loc[0], start, end, strand)
    block_bt = BedTool(block_bed, from_string=True)
    block_seq, bs = get_seq(block_bt)
    ext_seq = ''.join([bs for bs in block_seq.values()])
    bloc = ''.join([k for k in block_seq.keys()])
    
    return ext_seq, bloc

def increment_seed(seed, amount=1):
    seed += amount
    np.random.seed(seed)
    return seed

def get_random_block(chroms, gene_trees):
    '''
    Get random block sequence for feature
    not overlapping any other genomic features
    '''    
    block_size = np.random.randint(BLOCK_RANGE[0], BLOCK_RANGE[1])
    chrom = np.random.choice(chroms)
    chrom_features = gene_trees[chrom]
    
    chr_range = chr_sizes[('chr%s' % chrom)]
    block_start = np.random.randint(chr_range[0], chr_range[1]-block_size)
    block_end = block_start + block_size        
    
    seed = SEED_INIT
    seq = 'N'
    while 'N' in seq:
        # only select sequence if there's no Ns
        while chrom_features.overlaps(block_start, block_end):
            block_start = np.random.randint(chr_range[0], chr_range[1]-block_size)
            block_end = block_start + block_size
            
            if chrom_features.overlaps(block_start, block_end):
                seed = increment_seed(seed)

        strand = np.random.choice(['+','-'])
        block_bed = '%s\t%d\t%d\t.\t1\t%s' % (chrom, block_start, block_end, strand)
        block_bt = BedTool(block_bed, from_string=True)
        block_seq, bs = get_seq(block_bt)
        seq = ''.join([bs for bs in block_seq.values()])
        
        if 'N' in seq:
            seed = increment_seed(seed)

    return block_seq, seed

def get_random_seq():
    '''
    Generate random insertion sequence
    '''
    ins_size = np.random.randint(INS_RANGE[0], INS_RANGE[1])
    ins = np.random.choice(BASES, ins_size)
    ins = ''.join(ins)
    return ins

def get_tx_seq(tx, all_exons, front=True, wt_out=True):
    '''
    Get fusion sequence of given transcript, returning
    sequence of first N exons for transcript 1 (front=True)
    and N exons for transcript 2 (front=False). By default,
    write out wildtype transcript to control reference file.
    '''
    exons = all_exons.filter(lambda x: x['transcript_id'] == tx).saveas()
    tx_seq, s = get_seq(exons)
    ex_list = [ex for ex in tx_seq.keys()]
    
    # pick N 5' exons for tx1 (front) and N 3' exons for tx2 (back)
    if front:
        ex_list = ex_list[:N_EXONS] if s == '+' else ex_list[-N_EXONS:]
    else:
        ex_list = ex_list[-N_EXONS:] if s == '+' else ex_list[:N_EXONS]
    
    # select sequences and reverse order if antisense
    seq = [tx_seq[ex] for ex in ex_list]
    seq = seq if s == '+' else [s for s in reversed(seq)]  
    
    if wt_out:
        write_sequence(tx_seq, s, CONTROL_FASTA_OUT, tx)
    
    return seq, s, ex_list

def write_fusion(tx1, tx2, all_exons, add=None):
    '''
    Get left and right sequences of given transcripts
    corresponding to the first N exons and last N exons of
    transcripts 1 and 2 respectively.
    Automatically writes wild type transcript to CONTROL_FASTA_OUT.
    '''    
    exon_types = ['EE', 'NE', 'INS']
    if add and add not in exon_types:
        raise ValueError('Invalid exon type to add, expected %s' % exon_types)
    
    # get sequence for tx1
    seq1, strand1, ex1_list = get_tx_seq(tx1, all_exons)
    
    # add to fusion list
    fusion_parts = [tx1]

    # extended or novel exon
    ext_seq, bloc = '', ''
    if add == 'EE':
        ext_seq, bloc = get_exon_seq(ex1_list, strand1, all_exons)
    elif add == 'NE':
        ext_seq, bloc = get_exon_seq(ex1_list, strand1, all_exons, extended=False)
    elif add == 'INS':
        ext_seq = get_random_seq()
        bloc = ext_seq
    fusion_parts.append(bloc)

    seq2 = ''
    if tx2:
        # get sequence for tx1
        seq2, strand2, ex2_list = get_tx_seq(tx2, all_exons, front=False)

        # add to fusion list
        fusion_parts.append(tx2)
    else:
        # unpartnered fusion
        block_seq, seed = get_random_block(chroms, gene_trees)
        seq2 = [s for s in block_seq.values()]
        bloc = ''.join([k for k in block_seq.keys()])
        fusion_parts.append(bloc)

    seq = ''.join(seq1 + [ext_seq] + seq2)
    name = '%s:%s:%s' % (tx1, bloc, tx2) if tx2 else '%s:%s' % (tx1, bloc)
    
    # write output
    with open(FASTA_OUT, 'a') as fout:
        fout.write('>%s\n' % name)
        fout.write(seq + '\n')
        
    return fusion_parts

In [19]:
%%time
# build GTF reference
gr = BedTool(GTF_REF) 

# ensure each transcript in reference has at least N exons
all_exons = gr.filter(lambda x: x[2] == 'exon').saveas()
all_txs = [(tx['transcript_id'], get_gene_name(tx)) for tx in all_exons]
valid_txs = pd.DataFrame(pd.Series(all_txs).value_counts(), columns=['exon_count'])
valid_txs = valid_txs[valid_txs.exon_count >= MIN_EXONS]
valid_txs = valid_txs.index.values

all_genes = np.unique([gene for tx, gene in valid_txs if gene != ''])
var_genes = np.empty(0)

chr_sizes = pybedtools.chromsizes('hg38')
chroms = np.unique([x.chrom for x in all_exons])

# make gene start/end reference
gene_trees = get_gene_features(chroms, gr)

CPU times: user 11.3 s, sys: 199 ms, total: 11.5 s
Wall time: 11.6 s


## Generate canonical fusions

Select `N_VARS` random genes with `N_VARS` random partners and fuse first `N_EXONS` exons to `N_EXONS` terminal exons.

In [20]:
%%time
# pick fusion genes
np.random.seed(SEED_INIT)
fus_genes = np.random.choice(all_genes, N_VARS * 2, replace=False)
fusions = zip(fus_genes[:N_VARS], fus_genes[N_VARS:])
fus_txs = [] # which fusions were used in the fusion; for reference

# make fusion genes
for gene1, gene2 in fusions:
    print('Generating %s:%s fusion...' % (gene1, gene2))

    # select first transcript from each gene
    tx1 = [tx for tx, gn in valid_txs if gn == gene1][0]
    tx2 = [tx for tx, gn in valid_txs if gn == gene2][0]

    fus_parts = write_fusion(tx1, tx2, all_exons)
    fus_parts.append('(%s:%s)' % (gene1, gene2))
    fus_txs.append(fus_parts)

Generating NCKAP1L:ALKBH2 fusion...
Generating LOC105369676:FAM216A fusion...
Generating DIABLO:LOC105370063 fusion...
Generating NOP2:SLC6A13 fusion...
Generating MRPS35:LOC105369758 fusion...
Generating WNT10B:ANKRD33 fusion...
Generating ESPL1:DHH fusion...
Generating RNFT2:LINC01559 fusion...
Generating MYF5:YARS2 fusion...
Generating ARL6IP4:NRIP2 fusion...
CPU times: user 49.3 s, sys: 506 ms, total: 49.8 s
Wall time: 51.1 s


## Generate unpartnered fusions

In [21]:
%%time
var_genes = fus_genes.copy()
available_genes = list(set(all_genes).symmetric_difference(var_genes))
ufus_genes = np.random.choice(available_genes, N_VARS, replace=False)
var_genes = np.concatenate([var_genes, ufus_genes])

for idx, gene in enumerate(ufus_genes):
    print('Generating %s unpartnered fusion...' % gene)
                                          
    # select first transcript from each gene
    tx = [tx for tx, gn in valid_txs if gn == gene][0]
        
    fus_parts = write_fusion(tx, None, all_exons)
    fus_parts.append('(%s:)' % (gene))
    fus_txs.append(fus_parts) 

Generating ORAI1 unpartnered fusion...
Generating SMARCD1 unpartnered fusion...
Generating LOC100507195 unpartnered fusion...
Generating SRSF9 unpartnered fusion...
Generating LOC105369724 unpartnered fusion...
Generating KITLG unpartnered fusion...
Generating YBX3 unpartnered fusion...
Generating LOC100130268 unpartnered fusion...
Generating LOC105369980 unpartnered fusion...
Generating LOC102723544 unpartnered fusion...
CPU times: user 23.9 s, sys: 293 ms, total: 24.2 s
Wall time: 25.3 s


## Fusions with extended exon at boundary

In [22]:
%%time
# pick fusion genes
available_genes = list(set(all_genes).symmetric_difference(var_genes))
efus_genes = np.random.choice(available_genes, N_VARS*2, replace=False)
var_genes = np.concatenate([var_genes, efus_genes])

# make fusion genes
fusions = zip(efus_genes[:N_VARS], efus_genes[N_VARS:])
for gene1, gene2 in fusions:
    print('Generating %s:%s fusion with extended exon...' % (gene1, gene2))

    # select first transcript from each gene
    tx1 = [tx for tx, gn in valid_txs if gn == gene1][0]
    tx2 = [tx for tx, gn in valid_txs if gn == gene2][0]

    fus_parts = write_fusion(tx1, tx2, all_exons, add='EE')
    fus_parts.append('(%s:%s)' % (gene1, gene2))
    fus_txs.append(fus_parts)

Generating FAM138D:LOC105369968 fusion with extended exon...
Generating LINC01498:LINC01486 fusion with extended exon...
Generating LOC107984516:RILPL2 fusion with extended exon...
Generating GALNT8:LOC105369907 fusion with extended exon...
Generating TBC1D15:LOC100996679 fusion with extended exon...
Generating SLC25A3:PRR13 fusion with extended exon...
Generating LOC107984498:C1RL-AS1 fusion with extended exon...
Generating SCAF11:LOC107987174 fusion with extended exon...
Generating LOC100507424:NRAV fusion with extended exon...
Generating MBD6:RPSAP52 fusion with extended exon...
CPU times: user 55.8 s, sys: 638 ms, total: 56.4 s
Wall time: 1min


## Fusions with novel exons at boundary

In [23]:
%%time
# pick fusion genes
available_genes = list(set(all_genes).symmetric_difference(var_genes))
nfus_genes = np.random.choice(available_genes, N_VARS*2, replace=False)
var_genes = np.concatenate([var_genes, nfus_genes])

# make fusion genes
fusions = zip(nfus_genes[:N_VARS], nfus_genes[N_VARS:])
for gene1, gene2 in fusions:
    print('Generating %s:%s fusion with novel exon...' % (gene1, gene2))

    # select first transcript from each gene
    tx1 = [tx for tx, gn in valid_txs if gn == gene1][0]
    tx2 = [tx for tx, gn in valid_txs if gn == gene2][0]

    fus_parts = write_fusion(tx1, tx2, all_exons, add='NE')
    fus_parts.append('(%s:%s)' % (gene1, gene2))
    fus_txs.append(fus_parts)

Generating COQ5:LOC105370076 fusion with novel exon...
Generating C12orf80:R3HDM2 fusion with novel exon...
Generating EIF2B1:LOC101928554 fusion with novel exon...
Generating APOBEC1:LOC105369715 fusion with novel exon...
Generating APAF1:RFC5 fusion with novel exon...
Generating GLT8D2:RP11-54A9.1 fusion with novel exon...
Generating LOC105370085:LOC107984435 fusion with novel exon...
Generating TIMELESS:HEBP1 fusion with novel exon...
Generating RP11-341G23.4:BLOC1S1-RDH5 fusion with novel exon...
Generating LOC100506691:LOC105369873 fusion with novel exon...
CPU times: user 55.6 s, sys: 669 ms, total: 56.3 s
Wall time: 1min


## Fusions with insertions at boundary

In [24]:
%%time
# pick fusion genes
available_genes = list(set(all_genes).symmetric_difference(var_genes))
ifus_genes = np.random.choice(available_genes, N_VARS*2, replace=False)
var_genes = np.concatenate([var_genes, ifus_genes])

# make fusion genes
fusions = zip(ifus_genes[:N_VARS], ifus_genes[N_VARS:])
for gene1, gene2 in fusions:
    print('Generating %s:%s fusion with a boundary insertion...' % (gene1, gene2))

    # select first transcript from each gene
    tx1 = [tx for tx, gn in valid_txs if gn == gene1][0]
    tx2 = [tx for tx, gn in valid_txs if gn == gene2][0]

    fus_parts = write_fusion(tx1, tx2, all_exons, add='INS')
    fus_parts.append('(%s:%s)' % (gene1, gene2))
    fus_txs.append(fus_parts)

Generating LOC105369885:RARG fusion with a boundary insertion...
Generating LOC105369759:YEATS4 fusion with a boundary insertion...
Generating NCAPD2:NEMP1 fusion with a boundary insertion...
Generating FAM186B:KRT6A fusion with a boundary insertion...
Generating UBE3B:BIN2 fusion with a boundary insertion...
Generating LINC01089:ZNF384 fusion with a boundary insertion...
Generating PWP1:LOC107983975 fusion with a boundary insertion...
Generating LOC107984452:DPY19L2 fusion with a boundary insertion...
Generating KRT78:ZNF664-FAM101A fusion with a boundary insertion...
Generating KRT8:XPOT fusion with a boundary insertion...
CPU times: user 47.1 s, sys: 378 ms, total: 47.5 s
Wall time: 48.7 s


## Write background genes

In [25]:
%%time
# write background genes
available_genes = list(set(all_genes).symmetric_difference(var_genes))
bg_set = np.random.choice(available_genes, N_BACKGROUND_GENES)
for gene in bg_set:
    tx = [tx for tx, gn in valid_txs if gn == gene][0]
    exons = all_exons.filter(lambda x: x['transcript_id'] == tx).saveas()
    tx_seq, strand = get_seq(exons)
    write_sequence(tx_seq, strand, CONTROL_FASTA_OUT, tx)
    write_sequence(tx_seq, strand, FASTA_OUT, tx)

CPU times: user 4min 4s, sys: 2.55 s, total: 4min 7s
Wall time: 4min 13s


## Generate reads with Art

In [26]:
%%time
# generate reads with art illumina
seeds = np.random.randint(0, 99999, 2)

subprocess.call(['mkdir', '-p', OUTDIR])

# generate case sample
subprocess.call([ART_ILLUMINA, '-ss', 'HS25', '-i', FASTA_OUT, 
                 '-p', '-l', str(READ_LEN), '-f', str(FOLD), '-m', str(FRAG_SIZE),
                 '-s', str(FRAG_SD), '-rs', str(seeds[0]), '-o', '%s/case_R' % OUTDIR])

# generate control
subprocess.call([ART_ILLUMINA, '-ss', 'HS25', '-i', CONTROL_FASTA_OUT, 
             '-p', '-l', str(READ_LEN), '-f', str(FOLD), '-m', str(FRAG_SIZE),
             '-s', str(FRAG_SD), '-rs', str(seeds[1]), '-o', '%s/control_R' % OUTDIR])

CPU times: user 2.81 ms, sys: 12.8 ms, total: 15.6 ms
Wall time: 11.2 s


In [27]:
%%time
for sample in ['case', 'control']:
    for r in range(2):
        outf = open('%s/%s_R%d.fastq.gz' % (OUTDIR, sample, (r+1)), 'w')
        subprocess.call(['gzip', '-c', '%s/%s_R%d.fq' % (OUTDIR, sample, (r+1))], stdout=outf)
        outf.close()

CPU times: user 3.53 ms, sys: 20.7 ms, total: 24.2 ms
Wall time: 13.2 s


In [28]:
fus_info = pd.DataFrame(fus_txs, columns=['tx1', 'insert', 'tx2', 'fusion'])
fus_info['gene1'] = fus_info.fusion.apply(lambda x: x.split(':')[0][1:])
fus_info['gene2'] = fus_info.fusion.apply(lambda x: x.split(':')[1][:-1])
fus_info['type'] = ['canonical_fusion'] * N_VARS \
                   + ['unpartnered_fusion'] * N_VARS \
                   + ['EE_fusion'] * N_VARS \
                   + ['NE_fusion'] * N_VARS \
                   + ['INS_fusion'] * N_VARS
del fus_info['fusion']
fus_info.to_csv('simulated_fusions.tsv', index=False, sep='\t')