In [16]:
import tempfile
import numpy as np
import pandas as pd
import collections
import re
import os
import subprocess
import pybedtools
from intervaltree import Interval, IntervalTree
from pybedtools import BedTool
from Bio import SeqIO

In [17]:
# simulation parameters
seed = 123
n_genes_per_type = 10
n_background_genes = 100
n_controls = 5
n_exons = 2 # fuse first N exons to last N exons
min_exons = 3
block_range = (20, 200)
fold = 50
frag_size = 300
frag_sd = 20

In [18]:
# references
art_illumina = '/Users/marek.cmero/apps/art_bin_MountRainier/art_illumina'
genome_fasta = '/Users/marek.cmero/reference/fastas/Homo_sapiens.GRCh38.dna.primary_assembly.fa'
gtf_ref = '/Users/marek.cmero/reference/gtf/chess_mini_chr12_ref.gtf'
fasta_out = 'cryptic_variants_simu.fasta'
normal_ref = 'cryptic_variants_control.fasta'
outdir = '/Users/marek.cmero/Desktop/output'

# cleanup (from older runs)
if os.path.exists(fasta_out):
    os.remove(fasta_out)
if os.path.exists(normal_ref):
    os.remove(normal_ref)

In [19]:
# functions
def get_gene_name(row):
    '''
    Prevents KeyError if gene name missing
    '''
    try:
        return row.attrs['gene_name']
    except KeyError:
        return ''
    
def get_seq(gr, fasta):
    '''
    Returns a dictionary of exon sequences
    and the corresponding strand of the transcript
    '''
    block_seqs = gr.sequence(fi=fasta, s=True)
    block_dict = collections.OrderedDict()
    with tempfile.NamedTemporaryFile() as fa_tmp:
        fa_tmp.write(bytes(open(block_seqs.seqfn).read(), 'utf-8'))
        fa_tmp.flush()

        for record in SeqIO.parse(fa_tmp.name, 'fasta'):
            block_dict[record.id] = str(record.seq)

    strand = re.search('\(([-+])\)', next(iter(block_dict.keys())))
    assert strand
    
    return block_dict, strand.group(1)

def write_sequence(seq_dict, strand, output_file, name):
    '''
    Writes sequence dictionary to output file
    '''
    seq = [seq_dict[ex] for ex in seq_dict.keys()]
    seq = seq if strand == '+' else [s for s in reversed(seq)]

    with open(output_file, 'a') as fout:
        fout.write('>%s\n' % name)
        fout.write(''.join(seq) + '\n')
        
def get_chrom_features(chrom, gr):
    '''
    Get all merged intervals on given chromosome
    '''
    chrom_features = gr.filter(lambda x: x.chrom == chrom).merge()    
    chrom_features = [(g.start, g.end) for g in gr]
    
    chrom_tree = IntervalTree()
    [chrom_tree.addi(s, e) for s, e in chrom_features]

    return chrom_tree

def get_random_block(chroms, block_range, gr, fasta):
    '''
    Get random block sequence for feature
    not overlapping any other genomic features
    '''    
    block_size = np.random.randint(block_range[0], block_range[1])
    chrom = np.random.choice(chroms)
    chrom_features = get_chrom_features(chrom, gr)
    
    chr_range = chr_sizes[('chr%s' % chrom)]
    block_start = np.random.randint(chr_range[0], chr_range[1]-block_size)
    block_end = block_start + block_size        
    
    while chrom_features.overlaps(block_start, block_end):
        block_start = np.random.randint(chr_range[0], chr_range[1]-block_size)
        block_end = block_start + block_size
    
    strand = np.random.choice(['+','-'])
    block_bed = '%s\t%d\t%d\t.\t1\t%s' % (chrom, block_start, block_end, strand)
    block_bt = BedTool(block_bed, from_string=True)
    block_seq, bs = get_seq(block_bt, fasta)
    
    return block_seq

In [20]:
%%time
# build GTF reference
gr = BedTool(gtf_ref) 

# ensure each transcript in reference has at least N exons
all_exons = gr.filter(lambda x: x[2] == 'exon').saveas()
all_txs = [(tx['transcript_id'], get_gene_name(tx)) for tx in all_exons]
valid_txs = pd.DataFrame(pd.Series(all_txs).value_counts(), columns=['exon_count'])
valid_txs = valid_txs[valid_txs.exon_count >= min_exons]
valid_txs = valid_txs.index.values

all_genes = np.unique([gene for tx, gene in valid_txs if gene != ''])
var_genes = np.empty(0)

chr_sizes = pybedtools.chromsizes('hg38')
chroms = np.unique([x.chrom for x in all_exons])

CPU times: user 9.21 s, sys: 257 ms, total: 9.47 s
Wall time: 9.56 s


## Generate canonical fusions

Select `n_genes_per_type` random genes with `n_genes_per_type` random partners and fuse first `n_exons` exons to `n_exons` terminal exons.

In [21]:
%%time
# pick fusion genes
np.random.seed(seed)
fus_genes = np.random.choice(all_genes, n_genes_per_type * 2, replace=False)
fusions = zip(fus_genes[:n_genes_per_type], fus_genes[n_genes_per_type:])
fus_txs = [] # which fusions were used in the fusion; for reference

# make fusion genes
for gene1, gene2 in fusions:
    print('Generating %s:%s fusion...' % (gene1, gene2))
    
    # select first transcript from each gene
    tx1 = [tx for tx, gn in valid_txs if gn == gene1][0]
    tx2 = [tx for tx, gn in valid_txs if gn == gene2][0]
    fus_txs.append((tx1, tx2))
        
    # get sequences
    exons1 = all_exons.filter(lambda x: x['transcript_id'] == tx1).saveas()
    exons2 = all_exons.filter(lambda x: x['transcript_id'] == tx2).saveas()

    tx1_seq, s1 = get_seq(exons1, genome_fasta)
    tx2_seq, s2 = get_seq(exons1, genome_fasta)

    # write wild-type transcripts
    write_sequence(tx1_seq, s1, normal_ref, tx1)
    write_sequence(tx2_seq, s2, normal_ref, tx2)
        
    # pick N 5' exons for transcript 1
    ex1_list = [ex for ex in tx1_seq.keys()]
    ex1_list = ex1_list[:n_exons] if s1 == '+' else ex1_list[-n_exons:]

    # pick N 3' exons for transcript 2
    ex2_list = [ex for ex in tx2_seq.keys()]
    ex2_list = ex2_list[-n_exons:] if s2 == '+' else ex2_list[:n_exons]

    # select sequences and reverse order if antisense
    seq1 = [tx1_seq[ex] for ex in ex1_list]
    seq1 = seq1 if s1 == '+' else [s for s in reversed(seq1)]   

    seq2 = [tx2_seq[ex] for ex in ex2_list]
    seq2 = seq2 if s2 == '+' else [s for s in reversed(seq2)] 

    seq = ''.join(seq1 + seq2)

    # write output
    with open(fasta_out, 'a') as fout:
        fout.write('>%s:%s\n' % (gene1, gene2))
        fout.write(seq + '\n')

Generating NCKAP1L:ALKBH2 fusion...
Generating LOC105369676:FAM216A fusion...
Generating DIABLO:LOC105370063 fusion...
Generating NOP2:SLC6A13 fusion...
Generating MRPS35:LOC105369758 fusion...
Generating WNT10B:ANKRD33 fusion...
Generating ESPL1:DHH fusion...
Generating RNFT2:LINC01559 fusion...
Generating MYF5:YARS2 fusion...
Generating ARL6IP4:NRIP2 fusion...
CPU times: user 1min 3s, sys: 1.16 s, total: 1min 5s
Wall time: 1min 6s


## Generate unpartnered fusions

In [22]:
%%time
np.random.seed(seed)

var_genes = fus_genes.copy()
available_genes = list(set(all_genes).symmetric_difference(var_genes))
ufus_genes = np.random.choice(available_genes, n_genes_per_type, replace=False)
var_genes = np.concatenate([var_genes, ufus_genes])

for idx, gene in enumerate(ufus_genes):
    print('Generating %s unpartnered fusion...' % gene)
    
    # select first transcript from each gene
    tx = [tx for tx, gn in valid_txs if gn == gene][0]
    fus_txs.append((tx1,))
        
    # get sequence
    exons = all_exons.filter(lambda x: x['transcript_id'] == tx).saveas()
    tx_seq, s = get_seq(exons, genome_fasta)

    # write wild-type transcript
    write_sequence(tx_seq, s, normal_ref, tx)
        
    # pick N 5' exons for transcript
    ex_list = [ex for ex in tx_seq.keys()]
    ex_list = ex_list[:n_exons] if s == '+' else ex_list[-n_exons:]
    
    # select sequences and reverse order if antisense
    seq1 = [tx_seq[ex] for ex in ex_list]
    seq1 = seq1 if s1 == '+' else [s for s in reversed(seq1)]
    
    block_seq = get_random_block(chroms, block_range, gr, genome_fasta)
    seq2 = [s for s in block_seq.values()]
    
    seq = ''.join(seq1 + seq2)
    bloc = ''.join([k for k in block_seq.keys()])
    
    # write output
    with open(fasta_out, 'a') as fout:
        fout.write('>%s:%s\n' % (gene, bloc))
        fout.write(seq + '\n')

Generating LOC105369907 unpartnered fusion...
Generating PCED1B unpartnered fusion...
Generating AGAP2 unpartnered fusion...
Generating LOC101928731 unpartnered fusion...
Generating PMEL unpartnered fusion...
Generating VDR unpartnered fusion...
Generating CHST11 unpartnered fusion...
Generating CERS5 unpartnered fusion...
Generating LOC105369623 unpartnered fusion...
Generating LOC105369677 unpartnered fusion...
CPU times: user 1min 48s, sys: 1.86 s, total: 1min 50s
Wall time: 1min 52s


In [26]:
%%time
# write background genes
np.random.seed(seed)
available_genes = list(set(all_genes).symmetric_difference(var_genes))
bg_set = np.random.choice(available_genes, n_background_genes)
for gene in bg_set:
    tx = [tx for tx, gn in valid_txs if gn == gene][0]
    exons = all_exons.filter(lambda x: x['transcript_id'] == tx).saveas()
    tx_seq, strand = get_seq(exons, genome_fasta)
    write_sequence(tx_seq, strand, normal_ref, tx)
    write_sequence(tx_seq, strand, fasta_out, tx)

CPU times: user 4min 50s, sys: 4.64 s, total: 4min 54s
Wall time: 5min 2s


In [29]:
%%time
# generate reads with art illumina
np.random.seed(seed)
seeds = np.random.randint(0, 99999, 2)

subprocess.call(['mkdir', '-p', outdir])

# generate case sample
subprocess.call([art_illumina, '-ss', 'HS25', '-i', fasta_out, 
                 '-p', '-l', '100', '-f', str(fold), '-m', str(frag_size),
                 '-s', str(frag_sd), '-rs', str(seeds[0]), '-o', '%s/case_R' % outdir])

# generate control
subprocess.call([art_illumina, '-ss', 'HS25', '-i', normal_ref, 
             '-p', '-l', '100', '-f', str(fold), '-m', str(frag_size),
             '-s', str(frag_sd), '-rs', str(seeds[1]), '-o', '%s/control_R' % outdir])

CPU times: user 3.51 ms, sys: 15.1 ms, total: 18.6 ms
Wall time: 12.4 s
