In [1]:
import os
import numpy as np
import pandas as pd
import pybedtools
import simu
import subprocess
from pybedtools import BedTool
from IPython.core.debugger import set_trace

%load_ext autoreload

In [2]:
# high-level parameters
# seed handed to np.random.seed() at the start of every sampling cell
seed_init = 234
# number of genes drawn (without replacement) for each variant type
n_vars = 10
# NOTE(review): not referenced by any cell visible in this notebook -- confirm
# whether it is still needed
n_background_genes = 100

# simulation parameters
# forwarded to simu.write_indel for DEL/INS/ITD variants; presumably the
# (min, max) variant size -- TODO confirm units and inclusivity in simu
indel_range = (7, 50)
# forwarded to simu.write_large_tsv for PTD/INV variants; presumably the
# (min, max) number of exons involved -- TODO confirm in simu
exons_range = (1, 3)

# read generation parameters (all passed on the ART command line)
# ART option -f
fold = 50
# ART option -m
frag_size = 300
# ART option -s
frag_sd = 20
# ART option -l
read_len = 150

In [3]:
# references/output files
# NOTE: these are machine-specific absolute paths; adjust before running elsewhere.

# ART read simulator executable
art_illumina = '/Users/marek.cmero/apps/art_bin_MountRainier/ART_ILLUMINA'
# genome sequence and gene annotation used as simulation input
genome_fasta = '/Users/marek.cmero/reference/fastas/Homo_sapiens.GRCh38.dna.primary_assembly.fa'
gtf_ref = '/Users/marek.cmero/reference/gtf/chess_mini_chr12_ref.gtf'
# every output file name is derived from this prefix
out_prefix = '/Users/marek.cmero/Desktop/output/tsvs'
control_fasta = '{}-control.fasta'.format(out_prefix)
case_fasta = '{}-case.fasta'.format(out_prefix)

In [4]:
%%time

# build GTF reference
gr = BedTool(gtf_ref)

# make gene start/end reference
gene_trees = simu.get_gene_features(gr)

# get exons and 
all_exons = gr.filter(lambda x: x[2] == 'exon').saveas()
all_genes = np.unique([simu.get_gene_name(ex) for ex in all_exons if simu.get_gene_name(ex)!=''])
var_genes = np.empty(0)
tsv_info = []

# cleanup and make outdir
if os.path.exists(control_fasta):
    os.remove(control_fasta)
if os.path.exists(case_fasta):
    os.remove(case_fasta)
outdir = '/'.join(out_prefix.split('/')[:-1])
subprocess.call(['mkdir', '-p', outdir])

CPU times: user 12.6 s, sys: 295 ms, total: 12.9 s
Wall time: 13.1 s


## Make deletion variants

In [5]:
%%time
%autoreload
np.random.seed(seed_init)

del_genes = np.random.choice(all_genes, n_vars, replace=False)
var_genes = np.concatenate([var_genes, del_genes])

for gene in del_genes:
    print('Generating %s deletion...' % gene)
    ex = [ex for ex in all_exons if simu.get_gene_name(ex) == gene][0]
    tx, chrom = ex['transcript_id'], ex.chrom
    varsize, exon = simu.write_indel(tx, all_exons, genome_fasta, 
                                     indel_range, out_prefix, vartype='DEL')
    loc = simu.get_gene_loc(chrom, gene_trees, gene)
    tsv_info.append([loc, tx, gene, varsize, exon, 'DEL'])

Generating KRT79 deletion...
Generating LOC105369807 deletion...
Generating LOC105369932 deletion...
Generating LINC00987 deletion...
Generating RP5-944M2.2 deletion...
Generating RP11-497G19.1 deletion...
Generating TMEM132C deletion...
Generating KRT6B deletion...
Generating LOC105369701 deletion...
Generating NECAP1 deletion...
CPU times: user 46.9 s, sys: 356 ms, total: 47.3 s
Wall time: 47.9 s


## Make insertion variants

In [6]:
%%time
%autoreload
np.random.seed(seed_init)

available_genes = list(set(all_genes).symmetric_difference(var_genes))
ins_genes = np.random.choice(available_genes, n_vars, replace=False)
var_genes = np.concatenate([var_genes, ins_genes])

for gene in ins_genes:
    print('Generating %s insertion...' % gene)
    ex = [ex for ex in all_exons if simu.get_gene_name(ex) == gene][0]
    tx, chrom = ex['transcript_id'], ex.chrom
    varsize, exon = simu.write_indel(tx, all_exons, genome_fasta,
                                     indel_range, out_prefix, vartype='INS')
    loc = simu.get_gene_loc(chrom, gene_trees, gene)
    tsv_info.append([loc, tx, gene, varsize, exon, 'INS'])

Generating LHX5-AS1 insertion...
Generating LOC100507424 insertion...
Generating SLC6A15 insertion...
Generating LOC105369895 insertion...
Generating PLA2G1B insertion...
Generating CFAP73 insertion...
Generating WDR66 insertion...
Generating LOC101927058 insertion...
Generating LOC107987178 insertion...
Generating LOC105370003 insertion...
CPU times: user 47.8 s, sys: 407 ms, total: 48.2 s
Wall time: 48.9 s


## Make ITDs

In [7]:
%%time
np.random.seed(seed_init)
available_genes = list(set(all_genes).symmetric_difference(var_genes))
itd_genes = np.random.choice(available_genes, n_vars, replace=False)
var_genes = np.concatenate([var_genes, itd_genes])

for gene in itd_genes:
    print('Generating %s ITD...' % gene)
    ex = [ex for ex in all_exons if simu.get_gene_name(ex) == gene][0]
    tx, chrom = ex['transcript_id'], ex.chrom
    varsize, exon = simu.write_indel(tx, all_exons, genome_fasta,
                                     indel_range, out_prefix, vartype='ITD')
    loc = simu.get_gene_loc(chrom, gene_trees, gene)
    tsv_info.append([loc, tx, gene, varsize, exon, 'ITD'])

Generating RP11-850F7.7 ITD...
Generating LIN7A ITD...
Generating SMARCC2 ITD...
Generating RP11-20D14.6 ITD...
Generating BTBD11 ITD...
Generating PXMP2 ITD...
Generating HBCBP ITD...
Generating RP11-780K2.1 ITD...
Generating FAM186B ITD...
Generating LOC105370018 ITD...
CPU times: user 48 s, sys: 389 ms, total: 48.4 s
Wall time: 49 s


## Make PTDs

In [8]:
%%time
%autoreload
np.random.seed(seed_init)

valid_txs, valid_genes = simu.get_valid_txs(all_exons, 2)
valid_txs = np.unique([tx for tx, gn in valid_txs])

available_genes = list(set(valid_genes).symmetric_difference(var_genes))
ptd_genes = np.random.choice(available_genes, n_vars, replace=False)
var_genes = np.concatenate([var_genes, ptd_genes])

for gene in ptd_genes:
    print('Generating %s PTD...' % gene)
    exs = [ex for ex in all_exons if simu.get_gene_name(ex) == gene]
    ex = [ex for ex in exs if ex['transcript_id'] in valid_txs][0]    
    tx, chrom = ex['transcript_id'], ex.chrom
    n_exons, exon = simu.write_large_tsv(tx, all_exons, genome_fasta,
                                     out_prefix, exons_range, vartype='PTD')
    loc = simu.get_gene_loc(chrom, gene_trees, gene)
    tsv_info.append([loc, tx, gene, n_exons, exon, 'PTD'])

Generating LOC105369728 PTD...
Generating SLC4A8 PTD...
Generating FAM216A PTD...
Generating LOC105369985 PTD...
Generating DPPA3 PTD...
Generating LOC107984472 PTD...
Generating RP11-756G20.1 PTD...
Generating GNB3 PTD...
Generating DYNLL1 PTD...
Generating ZNF10 PTD...
CPU times: user 50.2 s, sys: 382 ms, total: 50.6 s
Wall time: 51.2 s


## Make Inversions

In [9]:
%%time
%autoreload
np.random.seed(seed_init)

available_genes = list(set(valid_genes).symmetric_difference(var_genes))
inv_genes = np.random.choice(available_genes, n_vars, replace=False)
var_genes = np.concatenate([var_genes, inv_genes])

for gene in inv_genes:
    print('Generating %s inversion...' % gene)
    exs = [ex for ex in all_exons if simu.get_gene_name(ex) == gene]
    ex = [ex for ex in exs if ex['transcript_id'] in valid_txs][0]    
    tx, chrom = ex['transcript_id'], ex.chrom
    n_exons, exon = simu.write_large_tsv(tx, all_exons, genome_fasta,
                                         out_prefix, exons_range, vartype='INV')
    loc = simu.get_gene_loc(chrom, gene_trees, gene)
    tsv_info.append([loc, tx, gene, n_exons, exon, 'INV'])

Generating LOC105369906 inversion...
Generating LOC101929162 inversion...
Generating GTF2H3 inversion...
Generating NCAPD2 inversion...
Generating LOC105369151 inversion...
Generating LOC105370087 inversion...
Generating CD163L1 inversion...
Generating EIF2B1 inversion...
Generating RP11-277P12.9 inversion...
Generating LOC105370067 inversion...
CPU times: user 47.7 s, sys: 366 ms, total: 48.1 s
Wall time: 48.7 s


## Generate reads with Art

In [10]:
%%time
# generate reads with art illumina
np.random.seed(seed_init)
seeds = np.random.randint(0, 99999, 2)

# generate case sample
subprocess.call([art_illumina, '-ss', 'HS25', '-i', case_fasta, 
                 '-p', '-l', str(read_len), '-f', str(fold), '-m', str(frag_size),
                 '-s', str(frag_sd), '-rs', str(seeds[0]), '-o', '%s-case_R' % out_prefix])

# generate control
subprocess.call([art_illumina, '-ss', 'HS25', '-i', control_fasta, 
             '-p', '-l', str(read_len), '-f', str(fold), '-m', str(frag_size),
             '-s', str(frag_sd), '-rs', str(seeds[1]), '-o', '%s-control_R' % out_prefix])

CPU times: user 2.06 ms, sys: 9.58 ms, total: 11.6 ms
Wall time: 2.52 s


In [11]:
%%time
for sample in ['case', 'control']:
    for r in range(2):
        outf = open('%s-%s_R%d.fastq.gz' % (out_prefix, sample, (r+1)), 'w')
        subprocess.call(['gzip', '-c', '%s-%s_R%d.fq' % (out_prefix, sample, (r+1))], stdout=outf)
        outf.close()

CPU times: user 3.48 ms, sys: 15.3 ms, total: 18.7 ms
Wall time: 2.9 s


In [12]:
# one row per simulated variant, written as a tab-separated summary table
info_columns = ['loc', 'tx', 'gene', 'size', 'exon', 'vartype']
info = pd.DataFrame(tsv_info, columns=info_columns)
info_path = '%s_simulated.tsv' % out_prefix
info.to_csv(info_path, sep='\t', index=False)