In [79]:
import tempfile
import numpy as np
import pandas as pd
import collections
import re
import os
import subprocess
import pybedtools
from intervaltree import Interval, IntervalTree
from pybedtools import BedTool
from Bio import SeqIO

In [80]:
# simulation parameters
seed = 123
n_genes_per_type = 10
n_background_genes = 100
n_controls = 5
n_exons = 2 # fuse first N exons to last N exons
min_exons = 3
block_range = (30, 200)
fold = 50
frag_size = 300
frag_sd = 20
read_len = 150

In [81]:
# references
art_illumina = '/Users/marek.cmero/apps/art_bin_MountRainier/art_illumina'
genome_fasta = '/Users/marek.cmero/reference/fastas/Homo_sapiens.GRCh38.dna.primary_assembly.fa'
gtf_ref = '/Users/marek.cmero/reference/gtf/chess_mini_chr12_ref.gtf'
fasta_out = 'cryptic_variants_simu.fasta'
normal_ref = 'cryptic_variants_control.fasta'
outdir = '/Users/marek.cmero/Desktop/output'

# cleanup (from older runs)
if os.path.exists(fasta_out):
    os.remove(fasta_out)
if os.path.exists(normal_ref):
    os.remove(normal_ref)

In [82]:
# functions
def get_gene_name(row):
    '''
    Prevents KeyError if gene name missing
    '''
    try:
        return row.attrs['gene_name']
    except KeyError:
        return ''
    
def get_seq(gr, fasta):
    '''
    Returns a dictionary of exon sequences
    and the corresponding strand of the transcript
    '''
    block_seqs = gr.sequence(fi=fasta, s=True)
    block_dict = collections.OrderedDict()
    with tempfile.NamedTemporaryFile() as fa_tmp:
        fa_tmp.write(bytes(open(block_seqs.seqfn).read(), 'utf-8'))
        fa_tmp.flush()

        for record in SeqIO.parse(fa_tmp.name, 'fasta'):
            block_dict[record.id] = str(record.seq)

    strand = re.search('\(([-+])\)', next(iter(block_dict.keys())))
    assert strand
    
    return block_dict, strand.group(1)

def write_sequence(seq_dict, strand, output_file, name):
    '''
    Writes sequence dictionary to output file
    '''
    seq = [seq_dict[ex] for ex in seq_dict.keys()]
    seq = seq if strand == '+' else [s for s in reversed(seq)]

    with open(output_file, 'a') as fout:
        fout.write('>%s\n' % name)
        fout.write(''.join(seq) + '\n')
        
def get_chrom_features(chrom, gr):
    '''
    Get all merged intervals on given chromosome
    '''
    chrom_features = gr.filter(lambda x: x.chrom == chrom).merge()    
    chrom_features = [(g.start, g.end) for g in gr]
    
    chrom_tree = IntervalTree()
    [chrom_tree.addi(s, e) for s, e in chrom_features]

    return chrom_tree

def get_gene_features(chroms, gr):
    '''
    Get interval tree for start/ends for 
    each gene on each chromosome
    '''
    gn_ref = pd.DataFrame([(g.chrom, g.start, g.end, get_gene_name(g)) for g in gr])
    aggregator = {1: lambda x: min(x),
                  2: lambda x: max(x)}
    gn_ref = gn_ref.groupby([0, 3], as_index=False, sort=False).agg(aggregator)
    gn_ref = gn_ref[[0, 1, 2, 3]]
    gn_ref.columns = ['chrom', 'start', 'end', 'gene']
    gn_ref = gn_ref[gn_ref.gene!='']

    ref_trees = {}
    for chrom in chroms:
        chr_ref = gn_ref[gn_ref.chrom == chrom]
        ref_tree = IntervalTree()
        for s,e,g in zip(chr_ref['start'].values, chr_ref['end'].values, chr_ref['gene'].values):
            ref_tree.addi(s-1, e, g)
    ref_trees[chrom] = ref_tree
    
    return ref_trees

def get_random_block(chroms, block_range, gene_trees, fasta, seed):
    '''
    Get random block sequence for feature
    not overlapping any other genomic features
    '''    
    block_size = np.random.randint(block_range[0], block_range[1])
    chrom = np.random.choice(chroms)
    chrom_features = gene_trees[chrom]
    
    chr_range = chr_sizes[('chr%s' % chrom)]
    block_start = np.random.randint(chr_range[0], chr_range[1]-block_size)
    block_end = block_start + block_size        
    
    seq = 'N'
    while 'N' in seq:
        # only select sequence if there's no Ns
        while chrom_features.overlaps(block_start, block_end):
            seed += 123
            np.random.seed(seed)
            block_start = np.random.randint(chr_range[0], chr_range[1]-block_size)
            block_end = block_start + block_size

        strand = np.random.choice(['+','-'])
        block_bed = '%s\t%d\t%d\t.\t1\t%s' % (chrom, block_start, block_end, strand)
        block_bt = BedTool(block_bed, from_string=True)
        block_seq, bs = get_seq(block_bt, fasta)
        seq = ''.join([bs for bs in block_seq.values()])

    return block_seq, seed

In [83]:
%%time
# build GTF reference
gr = BedTool(gtf_ref) 

# ensure each transcript in reference has at least N exons
all_exons = gr.filter(lambda x: x[2] == 'exon').saveas()
all_txs = [(tx['transcript_id'], get_gene_name(tx)) for tx in all_exons]
valid_txs = pd.DataFrame(pd.Series(all_txs).value_counts(), columns=['exon_count'])
valid_txs = valid_txs[valid_txs.exon_count >= min_exons]
valid_txs = valid_txs.index.values

all_genes = np.unique([gene for tx, gene in valid_txs if gene != ''])
var_genes = np.empty(0)

chr_sizes = pybedtools.chromsizes('hg38')
chroms = np.unique([x.chrom for x in all_exons])

# make gene start/end reference
gene_trees = get_gene_features(chroms, gr)

CPU times: user 11 s, sys: 179 ms, total: 11.2 s
Wall time: 11.3 s


## Generate canonical fusions

Select `n_genes_per_type` random genes with `n_genes_per_type` random partners and fuse first `n_exons` exons to `n_exons` terminal exons.

In [84]:
%%time
# pick fusion genes
np.random.seed(seed)
fus_genes = np.random.choice(all_genes, n_genes_per_type * 2, replace=False)
fusions = zip(fus_genes[:n_genes_per_type], fus_genes[n_genes_per_type:])
fus_txs = [] # which fusions were used in the fusion; for reference

# make fusion genes
for gene1, gene2 in fusions:
    print('Generating %s:%s fusion...' % (gene1, gene2))
    
    # select first transcript from each gene
    tx1 = [tx for tx, gn in valid_txs if gn == gene1][0]
    tx2 = [tx for tx, gn in valid_txs if gn == gene2][0]
    fus_txs.append((tx1, tx2))
        
    # get sequences
    exons1 = all_exons.filter(lambda x: x['transcript_id'] == tx1).saveas()
    exons2 = all_exons.filter(lambda x: x['transcript_id'] == tx2).saveas()

    tx1_seq, s1 = get_seq(exons1, genome_fasta)
    tx2_seq, s2 = get_seq(exons2, genome_fasta)

    # write wild-type transcripts
    write_sequence(tx1_seq, s1, normal_ref, tx1)
    write_sequence(tx2_seq, s2, normal_ref, tx2)
        
    # pick N 5' exons for transcript 1
    ex1_list = [ex for ex in tx1_seq.keys()]
    ex1_list = ex1_list[:n_exons] if s1 == '+' else ex1_list[-n_exons:]

    # pick N 3' exons for transcript 2
    ex2_list = [ex for ex in tx2_seq.keys()]
    ex2_list = ex2_list[-n_exons:] if s2 == '+' else ex2_list[:n_exons]

    # select sequences and reverse order if antisense
    seq1 = [tx1_seq[ex] for ex in ex1_list]
    seq1 = seq1 if s1 == '+' else [s for s in reversed(seq1)]   

    seq2 = [tx2_seq[ex] for ex in ex2_list]
    seq2 = seq2 if s2 == '+' else [s for s in reversed(seq2)] 

    seq = ''.join(seq1 + seq2)

    # write output
    with open(fasta_out, 'a') as fout:
        fout.write('>%s:%s\n' % (gene1, gene2))
        fout.write(seq + '\n')

Generating NCKAP1L:ALKBH2 fusion...
Generating LOC105369676:FAM216A fusion...
Generating DIABLO:LOC105370063 fusion...
Generating NOP2:SLC6A13 fusion...
Generating MRPS35:LOC105369758 fusion...
Generating WNT10B:ANKRD33 fusion...
Generating ESPL1:DHH fusion...
Generating RNFT2:LINC01559 fusion...
Generating MYF5:YARS2 fusion...
Generating ARL6IP4:NRIP2 fusion...
CPU times: user 49.2 s, sys: 544 ms, total: 49.8 s
Wall time: 51.2 s


## Generate unpartnered fusions

In [85]:
%%time
np.random.seed(seed)

var_genes = fus_genes.copy()
available_genes = list(set(all_genes).symmetric_difference(var_genes))
ufus_genes = np.random.choice(available_genes, n_genes_per_type, replace=False)
var_genes = np.concatenate([var_genes, ufus_genes])

for idx, gene in enumerate(ufus_genes):
    # select first transcript from each gene
    tx = [tx for tx, gn in valid_txs if gn == gene][0]
        
    # get sequence
    exons = all_exons.filter(lambda x: x['transcript_id'] == tx).saveas()
    tx_seq, s = get_seq(exons, genome_fasta)

    # write wild-type transcript
    write_sequence(tx_seq, s, normal_ref, tx)
        
    # pick N 5' exons for transcript
    ex_list = [ex for ex in tx_seq.keys()]
    ex_list = ex_list[:n_exons] if s == '+' else ex_list[-n_exons:]
    
    # select sequences and reverse order if antisense
    seq1 = [tx_seq[ex] for ex in ex_list]
    seq1 = seq1 if s == '+' else [s for s in reversed(seq1)]
    
    block_seq, seed = get_random_block(chroms, block_range, gene_trees, genome_fasta, seed)
    seq2 = [s for s in block_seq.values()]
    
    seq = ''.join(seq1 + seq2)
    bloc = ''.join([k for k in block_seq.keys()])
    fus_txs.append((tx, bloc))
    
    print('Generating %s:%s unpartnered fusion...' % (gene, bloc))    
    # write output
    with open(fasta_out, 'a') as fout:
        fout.write('>%s:%s\n' % (gene, bloc))
        fout.write(seq + '\n')

Generating CAPZA3:12:65977522-65977661(+) unpartnered fusion...
Generating EEA1:12:28596154-28596187(-) unpartnered fusion...
Generating SYT1:12:118813329-118813521(+) unpartnered fusion...
Generating SNRPF:12:83194082-83194240(+) unpartnered fusion...
Generating RP11-446N19.1:12:88615334-88615413(+) unpartnered fusion...
Generating MGST1:12:65690383-65690574(-) unpartnered fusion...
Generating ATG101:12:65135549-65135734(+) unpartnered fusion...
Generating LOC105379613:12:52503345-52503375(+) unpartnered fusion...
Generating LOC105369735:12:55220154-55220207(+) unpartnered fusion...
Generating RSRC2:12:43569701-43569897(-) unpartnered fusion...
CPU times: user 23.8 s, sys: 318 ms, total: 24.2 s
Wall time: 25.3 s


In [86]:
# transcript IDs used for cryptic variants
fus_txs

[('CHS.11800.1', 'CHS.12716.4'),
 ('CHS.11139.1', 'CHS.12756.5'),
 ('CHS.13049.18', 'CHS.13194.5'),
 ('CHS.10783.2', 'CHS.10619.8'),
 ('CHS.11278.7', 'CHS.11546.4'),
 ('CHS.11540.2', 'CHS.11645.2'),
 ('CHS.11726.4', 'CHS.11551.1'),
 ('CHS.12898.3', 'CHS.11083.1'),
 ('CHS.12272.1', 'CHS.11365.5'),
 ('CHS.13078.4', 'CHS.10681.7'),
 ('CHS.11148.7', '12:65977522-65977661(+)'),
 ('CHS.12410.4', '12:28596154-28596187(-)'),
 ('CHS.12254.14', '12:118813329-118813521(+)'),
 ('CHS.12477.4', '12:83194082-83194240(+)'),
 ('CHS.11458.28', '12:88615334-88615413(+)'),
 ('CHS.11132.23', '12:65690383-65690574(-)'),
 ('CHS.11653.4', '12:65135549-65135734(+)'),
 ('CHS.13178.1', '12:52503345-52503375(+)'),
 ('CHS.11400.1', '12:55220154-55220207(+)'),
 ('CHS.13059.12', '12:43569701-43569897(-)')]

In [87]:
%%time
# write background genes
np.random.seed(seed)
available_genes = list(set(all_genes).symmetric_difference(var_genes))
bg_set = np.random.choice(available_genes, n_background_genes)
for gene in bg_set:
    tx = [tx for tx, gn in valid_txs if gn == gene][0]
    exons = all_exons.filter(lambda x: x['transcript_id'] == tx).saveas()
    tx_seq, strand = get_seq(exons, genome_fasta)
    write_sequence(tx_seq, strand, normal_ref, tx)
    write_sequence(tx_seq, strand, fasta_out, tx)

CPU times: user 3min 51s, sys: 2.1 s, total: 3min 53s
Wall time: 3min 59s


In [88]:
%%time
# generate reads with art illumina
np.random.seed(seed)
seeds = np.random.randint(0, 99999, 2)

subprocess.call(['mkdir', '-p', outdir])

# generate case sample
subprocess.call([art_illumina, '-ss', 'HS25', '-i', fasta_out, 
                 '-p', '-l', str(read_len), '-f', str(fold), '-m', str(frag_size),
                 '-s', str(frag_sd), '-rs', str(seeds[0]), '-o', '%s/case_R' % outdir])

# generate control
subprocess.call([art_illumina, '-ss', 'HS25', '-i', normal_ref, 
             '-p', '-l', str(read_len), '-f', str(fold), '-m', str(frag_size),
             '-s', str(frag_sd), '-rs', str(seeds[1]), '-o', '%s/control_R' % outdir])

CPU times: user 2.86 ms, sys: 16 ms, total: 18.8 ms
Wall time: 6.53 s


In [89]:
for sample in ['case', 'control']:
    for r in range(2):
        outf = open('%s/%s_R%d.fastq.gz' % (outdir, sample, (r+1)), 'w')
        subprocess.call(['gzip', '-c', '%s/%s_R%d.fq' % (outdir, sample, (r+1))], stdout=outf)
        outf.close()