In [1]:
import os
import numpy as np
import pandas as pd
import pybedtools
import simu
import subprocess
from pybedtools import BedTool

# ---------------------------------------------------------------------------
# Notebook configuration: every tunable value and every file location used by
# the cells below is defined here.
# ---------------------------------------------------------------------------

# high-level parameters
seed_init = 123            # handed to np.random.seed() before any gene is picked
n_vars = 10                # number of events simulated per variant/fusion type
n_background_genes = 100   # number of genes written unmodified to both samples
min_exons = 3              # passed to simu.get_valid_txs(); presumably the minimum
                           # exon count for a usable transcript -- confirm in simu

# simulation parameters (exact semantics are defined inside the simu module)
n_exons = 2                # forwarded to simu.write_fusion() as params['n_exons']
block_range = (30, 200)    # forwarded to the novel-exon / truncated-exon writers
indel_range = (7, 50)      # forwarded to simu.write_indel() and as params['ins_range']
exons_range = (1, 3)       # forwarded to simu.write_large_tsv() (PTD / INV events)

# read generation parameters (each one becomes an ART_ILLUMINA flag)
fold = 50                  # -f : fold coverage
frag_size = 300            # -m : mean fragment size
frag_sd = 20               # -s : fragment size standard deviation
read_len = 150             # -l : read length

# references/output files
# The machine-specific base directories are defined once so the notebook can be
# moved to another machine by editing these three lines only.
apps_dir = '/Users/marek.cmero/apps'
reference_dir = '/Users/marek.cmero/reference'
output_dir = '/Users/marek.cmero/Desktop/output'

art_illumina = '%s/art_bin_MountRainier/ART_ILLUMINA' % apps_dir
genome_fasta = '%s/fastas/Homo_sapiens.GRCh38.dna.primary_assembly.fa' % reference_dir
gtf_ref = '%s/gtf/chess_mini_chr12_ref.gtf' % reference_dir
junc_ref_file = '%s/gtf/chess_mini_chr12_ref.info' % reference_dir

# every output file name starts with out_prefix
out_prefix = '%s/allvars' % output_dir
control_fasta = '%s-control.fasta' % out_prefix
case_fasta = '%s-case.fasta' % out_prefix

In [2]:
# build GTF reference
gr = BedTool(gtf_ref)
junc_ref = simu.build_junc_ref(junc_ref_file)

# make gene start/end reference
gene_trees = simu.get_gene_features(gr)

# keep only the exon records of the GTF (column 3 is the feature type)
all_exons = gr.filter(lambda x: x[2] == 'exon').saveas()

# unique gene names across all exons, skipping records without a gene name;
# the name is looked up once per exon rather than twice
exon_gene_names = (simu.get_gene_name(ex) for ex in all_exons)
all_genes = np.unique([name for name in exon_gene_names if name != ''])

# get valid txs
valid_txs, valid_genes = simu.get_valid_txs(all_exons, min_exons)
valid_txs = np.unique([tx for tx, gn in valid_txs])

# pool of genes still free to be used; the cells below draw from it via
# simu.pick_genes() and rebind it, so re-run this cell before re-running them
available_genes = [gene for gene in all_genes if gene in valid_genes]

# params dict (forwarded to simu.write_fusion)
params = {'n_exons': n_exons,
          'ins_range': indel_range,
          'block_range': block_range,
          'out_prefix': out_prefix}

# cleanup: drop FASTAs left over from a previous run
# NOTE(review): presumably needed because the simu writers append to these
# files -- confirm in simu
for stale_fasta in (control_fasta, case_fasta):
    if os.path.exists(stale_fasta):
        os.remove(stale_fasta)

# make outdir without shelling out to `mkdir`; an out_prefix with no directory
# component yields '' and needs no directory
outdir = os.path.dirname(out_prefix)
if outdir:
    os.makedirs(outdir, exist_ok=True)

# seed numpy's global RNG so the simulation cells below are reproducible
np.random.seed(seed_init)

## Fusions

In [3]:
%%time

# column layout of the fusion summary table written at the end of this cell
fuscols = ['loc1', 'gene1', 'tx1', 'insert', 'loc2', 'gene2', 'tx2', 'fusion_type']

# Five batches of n_vars fusions each. The first four batches get a partner
# gene; the last batch has an empty partner and is reported as 'unpartnered'.
fus_genes, available_genes = simu.pick_genes(n_vars * 5, available_genes)
partner_genes, available_genes = simu.pick_genes(n_vars * 4, available_genes)
partner_genes = list(partner_genes) + [''] * n_vars

# per-fusion modifier passed to simu.write_fusion(add=...); '' means none
add_labels = [label
              for label in ('', 'EE', 'NE', 'INS', '')
              for _ in range(n_vars)]

fusions = pd.DataFrame({'gene1': fus_genes,
                        'gene2': partner_genes,
                        'add': add_labels})
fus_info = []

for fusion in fusions.itertuples(index=False):
    has_partner = fusion.gene2 != ''
    add = fusion.add if fusion.add != '' else None

    # label used for progress output only
    if not has_partner:
        vartype = 'unpartnered'
    elif add is None:
        vartype = 'canonical'
    else:
        vartype = add
    print('Generating %s fusion...' % vartype)

    gene1 = fusion.gene1
    gene2 = fusion.gene2 if has_partner else None

    # first transcript returned for each gene; no tx2 without a partner
    tx1 = simu.get_transcripts(gene1, all_exons, valid_txs=valid_txs)[0]
    tx2 = None
    if gene2:
        tx2 = simu.get_transcripts(gene2, all_exons, valid_txs=valid_txs)[0]

    fus_parts = simu.write_fusion((tx1, tx2), (gene1, gene2), all_exons,
                                  genome_fasta, params, gene_trees, add=add)
    fus_info.append(fus_parts)

# persist what was simulated as a tab-separated table
fus_info = pd.DataFrame(fus_info, columns=fuscols)
fus_info.to_csv('%s_fusions_simulated.tsv' % out_prefix, index=False, sep='\t')

Generating canonical fusion...
Generating canonical fusion...
Generating canonical fusion...
Generating canonical fusion...
Generating canonical fusion...
Generating canonical fusion...
Generating canonical fusion...
Generating canonical fusion...
Generating canonical fusion...
Generating canonical fusion...
Generating EE fusion...
Generating EE fusion...
Generating EE fusion...
Generating EE fusion...
Generating EE fusion...
Generating EE fusion...
Generating EE fusion...
Generating EE fusion...
Generating EE fusion...
Generating EE fusion...
Generating NE fusion...
Generating NE fusion...
Generating NE fusion...
Generating NE fusion...
Generating NE fusion...
Generating NE fusion...
Generating NE fusion...
Generating NE fusion...
Generating NE fusion...
Generating NE fusion...
Generating INS fusion...
Generating INS fusion...
Generating INS fusion...
Generating INS fusion...
Generating INS fusion...
Generating INS fusion...
Generating INS fusion...
Generating INS fusion...
Generating

## TSVs and splice variants

In [4]:
%%time

# one gene per simulated event: ten variant types, n_vars events each
var_genes, available_genes = simu.pick_genes(n_vars * 10, available_genes)
vartypes = [label
            for label in ('DEL', 'INS', 'ITD', 'PTD', 'INV',
                          'EE', 'NE', 'RI', 'NEJ', 'US')
            for _ in range(n_vars)]

# column layout of the summary table written at the end of this cell
varcols = ['loc', 'tx', 'gene', 'size', 'exon', 'vartype']

var_df = pd.DataFrame({'gene': var_genes, 'vartype': vartypes})
var_info = []

for record in var_df.itertuples(index=False):
    gene, vartype = record.gene, record.vartype
    print('Generating %s TSV...' % vartype)

    # default transcript: the first one returned for the gene; the US and
    # novel-exon writers below return their own transcript instead
    tx = simu.get_transcripts(gene, all_exons, valid_txs=valid_txs)[0]

    if vartype in ('DEL', 'INS', 'ITD'):
        varsize, exon = simu.write_indel(tx, all_exons, genome_fasta,
                                         indel_range, out_prefix, vartype=vartype)
    elif vartype in ('PTD', 'INV'):
        varsize, exon = simu.write_large_tsv(tx, all_exons, genome_fasta,
                                             out_prefix, exons_range, vartype=vartype)
    elif vartype == 'NEJ':
        varsize, exon = simu.write_trunc_exons(tx, all_exons, genome_fasta,
                                               out_prefix, block_range)
    elif vartype == 'US':
        tx, loc, stats = simu.write_unannot_splice(gene, all_exons, valid_txs,
                                                   genome_fasta, out_prefix,
                                                   junc_ref, gene_trees)
        varsize, exon = stats['varsize'], stats['exon']
    else:
        # NE, EE or RI: an empty tx signals that the gene could not be used,
        # so draw a replacement gene and try again until one succeeds
        while True:
            tx, loc, stats = simu.write_novel_exon(gene, valid_txs, all_exons,
                                                   genome_fasta, out_prefix,
                                                   block_range, gene_trees,
                                                   vartype=vartype)
            if tx != '':
                break
            genes, available_genes = simu.pick_genes(1, available_genes)
            gene = genes[0]
        varsize, exon = stats['varsize'], stats['exon']

    # the reported location is always the gene-level one, whatever the
    # writer returned
    chrom = simu.get_tx_chrom(tx, all_exons)
    loc = simu.get_gene_loc(chrom, gene_trees, gene)
    var_info.append([loc, tx, gene, varsize, exon, vartype])

# persist what was simulated as a tab-separated table
var_info = pd.DataFrame(var_info, columns=varcols)
var_info.to_csv('%s_tsvs_splice_simulated.tsv' % out_prefix, index=False, sep='\t')

Generating DEL TSV...
Generating DEL TSV...
Generating DEL TSV...
Generating DEL TSV...
Generating DEL TSV...
Generating DEL TSV...
Generating DEL TSV...
Generating DEL TSV...
Generating DEL TSV...
Generating DEL TSV...
Generating INS TSV...
Generating INS TSV...
Generating INS TSV...
Generating INS TSV...
Generating INS TSV...
Generating INS TSV...
Generating INS TSV...
Generating INS TSV...
Generating INS TSV...
Generating INS TSV...
Generating ITD TSV...
Generating ITD TSV...
Generating ITD TSV...
Generating ITD TSV...
Generating ITD TSV...
Generating ITD TSV...
Generating ITD TSV...
Generating ITD TSV...
Generating ITD TSV...
Generating ITD TSV...
Generating PTD TSV...
Generating PTD TSV...
Generating PTD TSV...
Generating PTD TSV...
Generating PTD TSV...
Generating PTD TSV...
Generating PTD TSV...
Generating PTD TSV...
Generating PTD TSV...
Generating PTD TSV...
Generating INV TSV...
Generating INV TSV...
Generating INV TSV...
Generating INV TSV...
Generating INV TSV...
Generating

## Write background genes

In [5]:
%%time
# Background genes carry no simulated variant: the same wild-type transcript
# sequence is written to the control FASTA and then to the case FASTA.
bg_set, available_genes = simu.pick_genes(n_background_genes, available_genes)
for gene in bg_set:
    # first transcript returned for the gene
    tx = simu.get_transcripts(gene, all_exons, valid_txs=valid_txs)[0]

    # exon records belonging to that transcript
    tx_exons = all_exons.filter(lambda feature: feature['transcript_id'] == tx)
    exons = tx_exons.saveas()

    tx_seq, strand = simu.get_seq(exons, genome_fasta)
    for fasta_path in (control_fasta, case_fasta):
        simu.write_wildtype_sequence(tx_seq, strand, fasta_path, tx)

CPU times: user 7min 46s, sys: 3.76 s, total: 7min 50s
Wall time: 7min 57s


## Generate reads

In [6]:
%%time
# generate reads with art illumina
# two ART seeds drawn from numpy's global RNG (seeded earlier with seed_init):
# seeds[0] is used for the case sample, seeds[1] for the control
seeds = np.random.randint(0, 99999, 2)

# Simulate paired-end reads (-p) with the HS25 profile for each sample, case
# first. check_call raises CalledProcessError on a non-zero exit status, so a
# failed ART run stops the notebook instead of passing silently.
for sample, sample_fasta, art_seed in (('case', case_fasta, seeds[0]),
                                       ('control', control_fasta, seeds[1])):
    subprocess.check_call([art_illumina, '-ss', 'HS25', '-i', sample_fasta,
                           '-p', '-l', str(read_len), '-f', str(fold), '-m', str(frag_size),
                           '-s', str(frag_sd), '-rs', str(art_seed),
                           '-o', '%s-%s_R' % (out_prefix, sample)])

# gzip files: compress each mate's .fq into a .fastq.gz next to it
# (`gzip -c` writes to stdout, so the uncompressed .fq is kept)
for sample in ['case', 'control']:
    for r in range(2):
        fq_path = '%s-%s_R%d.fq' % (out_prefix, sample, (r+1))
        gz_path = '%s-%s_R%d.fastq.gz' % (out_prefix, sample, (r+1))
        # binary mode: gzip emits bytes; `with` closes the file even on failure
        with open(gz_path, 'wb') as outf:
            subprocess.check_call(['gzip', '-c', fq_path], stdout=outf)

CPU times: user 5.84 ms, sys: 25.1 ms, total: 30.9 ms
Wall time: 33.6 s
