In [1]:
import os
import numpy as np
import pandas as pd
import pybedtools
import simu
import subprocess
from pybedtools import BedTool

%load_ext autoreload

In [2]:
# high-level parameters
seed_init = 234  # passed to np.random.seed() before the first gene draw
n_vars = 10  # number of genes sampled for each variant type
n_background_genes = 100  # NOTE(review): not referenced in the cells visible here — confirm it is still needed

# simulation parameters
indel_range = (7, 50)  # handed to simu.write_indel; presumably (min, max) variant size — confirm in simu
exons_range = (1, 3)  # handed to simu.write_large_tsv; presumably (min, max) exon count — confirm in simu

# read generation parameters
# (each value is forwarded to the ART command line further down)
fold = 50  # ART -f
frag_size = 300  # ART -m
frag_sd = 20  # ART -s
read_len = 150  # ART -l

In [3]:
# references
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running the notebook on another system.
art_illumina = '/Users/marek.cmero/apps/art_bin_MountRainier/ART_ILLUMINA'  # ART executable run via subprocess
genome_fasta = '/Users/marek.cmero/reference/fastas/Homo_sapiens.GRCh38.dna.primary_assembly.fa'  # passed to the simu.write_* helpers
gtf_ref = '/Users/marek.cmero/reference/gtf/chess_mini_chr12_ref.gtf'  # annotation loaded into a BedTool
out_prefix = '/Users/marek.cmero/Desktop/output/tsvs'  # common prefix for every output file
# FASTA files used as ART input for the control and case samples
control_fasta = '%s-control.fasta' % out_prefix
case_fasta = '%s-case.fasta' % out_prefix

In [4]:
%%time

# build GTF reference
gr = BedTool(gtf_ref)

# make gene start/end reference
gene_trees = simu.get_gene_features(gr)

# get exons and 
all_exons = gr.filter(lambda x: x[2] == 'exon').saveas()
all_genes = np.unique([simu.get_gene_name(ex) for ex in all_exons if simu.get_gene_name(ex)!=''])
var_genes = np.empty(0)
tsv_info = []

# cleanup and make outdir
if os.path.exists(control_fasta):
    os.remove(control_fasta)
if os.path.exists(case_fasta):
    os.remove(case_fasta)
outdir = '/'.join(out_prefix.split('/')[:-1])
subprocess.call(['mkdir', '-p', outdir])

CPU times: user 11.5 s, sys: 167 ms, total: 11.6 s
Wall time: 11.7 s


## Make deletion variants

In [5]:
%%time
np.random.seed(seed_init)
del_genes = np.random.choice(all_genes, n_vars, replace=False)
var_genes = np.concatenate([var_genes, del_genes])

for gene in del_genes:
    print('Generating %s deletion...' % gene)
    tx = [ex['transcript_id'] for ex in all_exons if simu.get_gene_name(ex) == gene][0]
    varsize, exon = simu.write_indel(tx, all_exons, genome_fasta, 
                                     indel_range, out_prefix, vartype='DEL')

    tsv_info.append([tx, gene, varsize, exon, 'DEL'])

Generating KRT79 deletion...
Generating LOC105369807 deletion...
Generating LOC105369932 deletion...
Generating LINC00987 deletion...
Generating RP5-944M2.2 deletion...
Generating RP11-497G19.1 deletion...
Generating TMEM132C deletion...
Generating KRT6B deletion...
Generating LOC105369701 deletion...
Generating NECAP1 deletion...
CPU times: user 46.5 s, sys: 325 ms, total: 46.8 s
Wall time: 47.4 s


## Make insertion variants

In [6]:
%%time

available_genes = list(set(all_genes).symmetric_difference(var_genes))
ins_genes = np.random.choice(available_genes, n_vars, replace=False)
var_genes = np.concatenate([var_genes, ins_genes])

for gene in ins_genes:
    print('Generating %s insertion...' % gene)
    tx = [ex['transcript_id'] for ex in all_exons if simu.get_gene_name(ex) == gene][0]
    varsize, exon = simu.write_indel(tx, all_exons, genome_fasta,
                                     indel_range, out_prefix, vartype='INS')

    tsv_info.append([tx, gene, varsize, exon, 'INS'])

Generating LOC105369673 insertion...
Generating SLCO1B1 insertion...
Generating SLC2A14 insertion...
Generating LOC107984541 insertion...
Generating OLR1 insertion...
Generating LOC105369755 insertion...
Generating LOC105369593 insertion...
Generating LOC400043 insertion...
Generating RPSAP52 insertion...
Generating PDE6H insertion...
CPU times: user 45.4 s, sys: 288 ms, total: 45.7 s
Wall time: 46.2 s


## Make ITDs

In [7]:
%%time
available_genes = list(set(all_genes).symmetric_difference(var_genes))
itd_genes = np.random.choice(available_genes, n_vars, replace=False)
var_genes = np.concatenate([var_genes, itd_genes])

for gene in itd_genes:
    print('Generating %s ITD...' % gene)
    tx = [ex['transcript_id'] for ex in all_exons if simu.get_gene_name(ex) == gene][0]
    varsize, exon = simu.write_indel(tx, all_exons, genome_fasta,
                                     indel_range, out_prefix, vartype='ITD')

    tsv_info.append([tx, gene, varsize, exon, 'ITD'])

Generating PA2G4 ITD...
Generating NOP2 ITD...
Generating BRI3BP ITD...
Generating LOC105369930 ITD...
Generating PPHLN1 ITD...
Generating LOC105369811 ITD...
Generating LRP6 ITD...
Generating RP11-121C6.5 ITD...
Generating DUSP6 ITD...
Generating LINC00942 ITD...
CPU times: user 48.4 s, sys: 401 ms, total: 48.8 s
Wall time: 49.5 s


## Make PTDs

In [8]:
# %%time
%autoreload

valid_txs, valid_genes = simu.get_valid_txs(all_exons, 2)
valid_txs = np.unique([tx for tx, gn in valid_txs])

available_genes = list(set(valid_genes).symmetric_difference(var_genes))
ptd_genes = np.random.choice(available_genes, n_vars, replace=False)
var_genes = np.concatenate([var_genes, ptd_genes])

for gene in ptd_genes:
    print('Generating %s PTD...' % gene)
    txs = [ex['transcript_id'] for ex in all_exons if simu.get_gene_name(ex) == gene]
    tx = [tx for tx in txs if tx in valid_txs][0]
    n_exons, exon = simu.write_large_tsv(tx, all_exons, genome_fasta,
                                     out_prefix, exons_range, vartype='PTD')

    tsv_info.append([tx, gene, n_exons, exon, 'PTD'])

Generating LOC642846 PTD...
Generating ZNF384 PTD...
Generating LOC107984478 PTD...
Generating BTG1 PTD...
Generating LOC107987180 PTD...
Generating LOC105369761 PTD...
Generating LOC100507560 PTD...
Generating IRAK3 PTD...
Generating EP400NL PTD...
Generating LOC105369921 PTD...


## Make Inversions

In [9]:
%%time
%autoreload

available_genes = list(set(valid_genes).symmetric_difference(var_genes))
inv_genes = np.random.choice(available_genes, n_vars, replace=False)
var_genes = np.concatenate([var_genes, inv_genes])

for gene in inv_genes:
    print('Generating %s inversion...' % gene)
    txs = [ex['transcript_id'] for ex in all_exons if simu.get_gene_name(ex) == gene]
    tx = [tx for tx in txs if tx in valid_txs][0]
    n_exons, exon = simu.write_large_tsv(tx, all_exons, genome_fasta,
                                     out_prefix, exons_range, vartype='INV')

    tsv_info.append([tx, gene, n_exons, exon, 'INV'])

Generating LOC105369995 inversion...
Generating LRRC10 inversion...
Generating RP11-575F12.1 inversion...
Generating LOC400002 inversion...
Generating RP11-983C2.2 inversion...
Generating LOC107984457 inversion...
Generating LOC107984522 inversion...
Generating TMEM116 inversion...
Generating ANO4 inversion...
Generating HOXC-AS3 inversion...
CPU times: user 50.2 s, sys: 512 ms, total: 50.7 s
Wall time: 51.6 s


## Generate reads with ART

In [10]:
%%time
# generate reads with art illumina
seeds = np.random.randint(0, 99999, 2)

# generate case sample
subprocess.call([art_illumina, '-ss', 'HS25', '-i', case_fasta, 
                 '-p', '-l', str(read_len), '-f', str(fold), '-m', str(frag_size),
                 '-s', str(frag_sd), '-rs', str(seeds[0]), '-o', '%s-case_R' % out_prefix])

# generate control
subprocess.call([art_illumina, '-ss', 'HS25', '-i', control_fasta, 
             '-p', '-l', str(read_len), '-f', str(fold), '-m', str(frag_size),
             '-s', str(frag_sd), '-rs', str(seeds[1]), '-o', '%s-control_R' % out_prefix])

CPU times: user 2.25 ms, sys: 9.41 ms, total: 11.7 ms
Wall time: 2.38 s


In [11]:
%%time
for sample in ['case', 'control']:
    for r in range(2):
        outf = open('%s-%s_R%d.fastq.gz' % (out_prefix, sample, (r+1)), 'w')
        subprocess.call(['gzip', '-c', '%s-%s_R%d.fq' % (out_prefix, sample, (r+1))], stdout=outf)
        outf.close()

CPU times: user 3.32 ms, sys: 16.2 ms, total: 19.5 ms
Wall time: 2.8 s


In [12]:
# Turn the per-variant records gathered above into a table, one row per
# simulated variant, and save it tab-separated next to the other outputs.
info = pd.DataFrame(
    data=tsv_info,
    columns=['tx', 'gene', 'size', 'exon', 'vartype'],
)
info.to_csv('%s_simulated.tsv' % out_prefix, sep='\t', index=False)