In [1]:
import os
import numpy as np
import pandas as pd
import pybedtools
import simu
import subprocess
from pybedtools import BedTool

%load_ext autoreload
%autoreload

In [2]:
# high-level parameters
seed_init = 234
n_vars = 10
n_background_genes = 100

# simulation parameters
indel_range = (7, 50)

# read generation parameters
fold = 50
frag_size = 300
frag_sd = 20
read_len = 150

In [3]:
# references
art_illumina = '/Users/marek.cmero/apps/art_bin_MountRainier/ART_ILLUMINA'
genome_fasta = '/Users/marek.cmero/reference/fastas/Homo_sapiens.GRCh38.dna.primary_assembly.fa'
gtf_ref = '/Users/marek.cmero/reference/gtf/chess_mini_chr12_ref.gtf'
out_prefix = '/Users/marek.cmero/Desktop/output/tsvs'
control_fasta = '%s-control.fasta' % out_prefix
case_fasta = '%s-case.fasta' % out_prefix

In [4]:
%%time

# build GTF reference
gr = BedTool(gtf_ref)

# make gene start/end reference
gene_trees = simu.get_gene_features(gr)

# get exons and 
all_exons = gr.filter(lambda x: x[2] == 'exon').saveas()
all_genes = np.unique([simu.get_gene_name(ex) for ex in all_exons if simu.get_gene_name(ex)!=''])
var_genes = np.empty(0)
tsv_info = []

# cleanup and make outdir
if os.path.exists(control_fasta):
    os.remove(control_fasta)
if os.path.exists(case_fasta):
    os.remove(case_fasta)
outdir = '/'.join(out_prefix.split('/')[:-1])
subprocess.call(['mkdir', '-p', outdir])

CPU times: user 11.2 s, sys: 171 ms, total: 11.4 s
Wall time: 11.4 s


## Make deletion variants

In [5]:
%%time
%autoreload
np.random.seed(seed_init)
del_genes = np.random.choice(all_genes, n_vars, replace=False)
var_genes = np.concatenate([var_genes, del_genes])

for gene in del_genes:
    print('Generating %s deletion...' % gene)
    tx = [ex['transcript_id'] for ex in all_exons if simu.get_gene_name(ex) == gene][0]
    varsize, exon = simu.write_indel(tx, all_exons, genome_fasta, 
                                     indel_range, out_prefix, vartype='DEL')

    tsv_info.append([tx, gene, varsize, exon, 'DEL'])

Generating KRT79 deletion...
Generating LOC105369807 deletion...
Generating LOC105369932 deletion...
Generating LINC00987 deletion...
Generating RP5-944M2.2 deletion...
Generating RP11-497G19.1 deletion...
Generating TMEM132C deletion...
Generating KRT6B deletion...
Generating LOC105369701 deletion...
Generating NECAP1 deletion...
CPU times: user 45.6 s, sys: 326 ms, total: 45.9 s
Wall time: 46.5 s


In [6]:
%%time
%autoreload
available_genes = list(set(all_genes).symmetric_difference(var_genes))
ins_genes = np.random.choice(available_genes, n_vars, replace=False)
var_genes = np.concatenate([var_genes, ins_genes])

for gene in ins_genes:
    print('Generating %s insertion...' % gene)
    tx = [ex['transcript_id'] for ex in all_exons if simu.get_gene_name(ex) == gene][0]
    varsize, exon = simu.write_indel(tx, all_exons, genome_fasta,
                                     indel_range, out_prefix, vartype='INS')

    tsv_info.append([tx, gene, varsize, exon, 'INS'])

Generating RP11-89H19.2 insertion...
Generating RP11-340F14.6 insertion...
Generating LOC105369823 insertion...
Generating RP11-46H11.3 insertion...
Generating KITLG insertion...
Generating RP11-266K4.14 insertion...
Generating LOC107984547 insertion...
Generating RP11-547C5.1 insertion...
Generating LINC00485 insertion...
Generating HOXC11 insertion...
CPU times: user 45.7 s, sys: 307 ms, total: 46 s
Wall time: 46.6 s


In [7]:
%%time
%autoreload
available_genes = list(set(all_genes).symmetric_difference(var_genes))
itd_genes = np.random.choice(available_genes, n_vars, replace=False)
var_genes = np.concatenate([var_genes, itd_genes])

for gene in ins_genes:
    print('Generating %s ITD...' % gene)
    tx = [ex['transcript_id'] for ex in all_exons if simu.get_gene_name(ex) == gene][0]
    varsize, exon = simu.write_indel(tx, all_exons, genome_fasta,
                                     indel_range, out_prefix, vartype='ITD')

    tsv_info.append([tx, gene, varsize, exon, 'ITD'])

Generating RP11-89H19.2 ITD...
Generating RP11-340F14.6 ITD...
Generating LOC105369823 ITD...
Generating RP11-46H11.3 ITD...
Generating KITLG ITD...
Generating RP11-266K4.14 ITD...
Generating LOC107984547 ITD...
Generating RP11-547C5.1 ITD...
Generating LINC00485 ITD...
Generating HOXC11 ITD...
CPU times: user 45.8 s, sys: 307 ms, total: 46.1 s
Wall time: 46.7 s


## Generate reads with Art

In [8]:
%%time
# generate reads with art illumina
seeds = np.random.randint(0, 99999, 2)

# generate case sample
subprocess.call([art_illumina, '-ss', 'HS25', '-i', case_fasta, 
                 '-p', '-l', str(read_len), '-f', str(fold), '-m', str(frag_size),
                 '-s', str(frag_sd), '-rs', str(seeds[0]), '-o', '%s-case_R' % out_prefix])

# generate control
subprocess.call([art_illumina, '-ss', 'HS25', '-i', control_fasta, 
             '-p', '-l', str(read_len), '-f', str(fold), '-m', str(frag_size),
             '-s', str(frag_sd), '-rs', str(seeds[1]), '-o', '%s-control_R' % out_prefix])

CPU times: user 2.28 ms, sys: 10.1 ms, total: 12.4 ms
Wall time: 1.21 s


In [9]:
%%time
for sample in ['case', 'control']:
    for r in range(2):
        outf = open('%s-%s_R%d.fastq.gz' % (out_prefix, sample, (r+1)), 'w')
        subprocess.call(['gzip', '-c', '%s-%s_R%d.fq' % (out_prefix, sample, (r+1))], stdout=outf)
        outf.close()

CPU times: user 3.62 ms, sys: 14.4 ms, total: 18.1 ms
Wall time: 1.41 s


In [10]:
info = pd.DataFrame(tsv_info, columns=['tx', 'gene', 'size', 'exon', 'vartype'])
info.to_csv('%s_simulated.tsv' % out_prefix, index=False, sep='\t')