In [1]:
import os
import numpy as np
import pandas as pd
import pybedtools
import simu
import subprocess
from pybedtools import BedTool

%load_ext autoreload

In [2]:
# high-level parameters
seed_init = 234  # value handed to np.random.seed() at the start of every stochastic cell
n_vars = 10  # number of genes drawn (without replacement) for each variant type
n_background_genes = 100  # NOTE(review): not referenced by any cell visible here — confirm it is still needed

# simulation parameters
indel_range = (7, 50)  # forwarded to simu.write_indel (DEL/INS/ITD); presumably (min, max) variant size — TODO confirm
exons_range = (1, 3)  # forwarded to simu.write_large_tsv (PTD/INV); presumably (min, max) exon count — TODO confirm

# read generation parameters
fold = 50  # passed to ART_ILLUMINA as the -f argument
frag_size = 300  # passed to ART_ILLUMINA as the -m argument
frag_sd = 20  # passed to ART_ILLUMINA as the -s argument
read_len = 150  # passed to ART_ILLUMINA as the -l argument

In [3]:
# references/output files
# Every location can be overridden with an environment variable so the
# notebook can run on another machine without editing this cell; when a
# variable is unset the original hard-coded path is used unchanged.
art_illumina = os.environ.get(
    'SIMU_ART_ILLUMINA',
    '/Users/marek.cmero/apps/art_bin_MountRainier/ART_ILLUMINA')
genome_fasta = os.environ.get(
    'SIMU_GENOME_FASTA',
    '/Users/marek.cmero/reference/fastas/Homo_sapiens.GRCh38.dna.primary_assembly.fa')
gtf_ref = os.environ.get(
    'SIMU_GTF_REF',
    '/Users/marek.cmero/reference/gtf/chess_mini_chr12_ref.gtf')
# out_prefix is a path *prefix* (directory + file stem), not a directory:
# every output file name is built by appending a suffix to it.
out_prefix = os.environ.get(
    'SIMU_OUT_PREFIX',
    '/Users/marek.cmero/Desktop/output/tsvs')
control_fasta = '%s-control.fasta' % out_prefix
case_fasta = '%s-case.fasta' % out_prefix

In [4]:
%%time

# build GTF reference
gr = BedTool(gtf_ref)

# make gene start/end reference
gene_trees = simu.get_gene_features(gr)

# get exons and 
all_exons = gr.filter(lambda x: x[2] == 'exon').saveas()
all_genes = np.unique([simu.get_gene_name(ex) for ex in all_exons if simu.get_gene_name(ex)!=''])
var_genes = np.empty(0)
tsv_info = []

# cleanup and make outdir
if os.path.exists(control_fasta):
    os.remove(control_fasta)
if os.path.exists(case_fasta):
    os.remove(case_fasta)
outdir = '/'.join(out_prefix.split('/')[:-1])
subprocess.call(['mkdir', '-p', outdir])

CPU times: user 13.1 s, sys: 299 ms, total: 13.4 s
Wall time: 13.6 s


## Make deletion variants

In [5]:
%%time
np.random.seed(seed_init)
del_genes = np.random.choice(all_genes, n_vars, replace=False)
var_genes = np.concatenate([var_genes, del_genes])

for gene in del_genes:
    print('Generating %s deletion...' % gene)
    tx = [ex['transcript_id'] for ex in all_exons if simu.get_gene_name(ex) == gene][0]
    varsize, exon = simu.write_indel(tx, all_exons, genome_fasta, 
                                     indel_range, out_prefix, vartype='DEL')

    tsv_info.append([tx, gene, varsize, exon, 'DEL'])

Generating KRT79 deletion...
Generating LOC105369807 deletion...
Generating LOC105369932 deletion...
Generating LINC00987 deletion...
Generating RP5-944M2.2 deletion...
Generating RP11-497G19.1 deletion...
Generating TMEM132C deletion...
Generating KRT6B deletion...
Generating LOC105369701 deletion...
Generating NECAP1 deletion...
CPU times: user 54.5 s, sys: 749 ms, total: 55.2 s
Wall time: 56.5 s


## Make insertion variants

In [6]:
%%time
%autoreload
np.random.seed(seed_init)
available_genes = list(set(all_genes).symmetric_difference(var_genes))
ins_genes = np.random.choice(available_genes, n_vars, replace=False)
var_genes = np.concatenate([var_genes, ins_genes])

for gene in ins_genes:
    print('Generating %s insertion...' % gene)
    tx = [ex['transcript_id'] for ex in all_exons if simu.get_gene_name(ex) == gene][0]
    varsize, exon = simu.write_indel(tx, all_exons, genome_fasta,
                                     indel_range, out_prefix, vartype='INS')

    tsv_info.append([tx, gene, varsize, exon, 'INS'])

Generating LOC105369697 insertion...
Generating KMT2D insertion...
Generating LOC105369682 insertion...
Generating LOC105369665 insertion...
Generating LOC102724261 insertion...
Generating LOC101928705 insertion...
Generating SMCO3 insertion...
Generating RP11-116D17.5 insertion...
Generating LOC100996246 insertion...
Generating RP11-465L8.1 insertion...
CPU times: user 52.6 s, sys: 604 ms, total: 53.2 s
Wall time: 54.1 s


## Make ITDs

In [7]:
%%time
np.random.seed(seed_init)
available_genes = list(set(all_genes).symmetric_difference(var_genes))
itd_genes = np.random.choice(available_genes, n_vars, replace=False)
var_genes = np.concatenate([var_genes, itd_genes])

for gene in itd_genes:
    print('Generating %s ITD...' % gene)
    tx = [ex['transcript_id'] for ex in all_exons if simu.get_gene_name(ex) == gene][0]
    varsize, exon = simu.write_indel(tx, all_exons, genome_fasta,
                                     indel_range, out_prefix, vartype='ITD')

    tsv_info.append([tx, gene, varsize, exon, 'ITD'])

Generating ARNTL2-AS1 ITD...
Generating GLTP ITD...
Generating ANAPC5 ITD...
Generating MORN3 ITD...
Generating LOC102724433 ITD...
Generating BCL7A ITD...
Generating ANKRD13A ITD...
Generating DDX47 ITD...
Generating RP11-83B20.3 ITD...
Generating CLEC4E ITD...
CPU times: user 52.8 s, sys: 604 ms, total: 53.4 s
Wall time: 54.3 s


## Make PTDs

In [8]:
%%time
%autoreload
np.random.seed(seed_init)

valid_txs, valid_genes = simu.get_valid_txs(all_exons, 2)
valid_txs = np.unique([tx for tx, gn in valid_txs])

available_genes = list(set(valid_genes).symmetric_difference(var_genes))
ptd_genes = np.random.choice(available_genes, n_vars, replace=False)
var_genes = np.concatenate([var_genes, ptd_genes])

for gene in ptd_genes:
    print('Generating %s PTD...' % gene)
    txs = [ex['transcript_id'] for ex in all_exons if simu.get_gene_name(ex) == gene]
    tx = [tx for tx in txs if tx in valid_txs][0]
    n_exons, exon = simu.write_large_tsv(tx, all_exons, genome_fasta,
                                     out_prefix, exons_range, vartype='PTD')

    tsv_info.append([tx, gene, n_exons, exon, 'PTD'])

Generating FAM222A-AS1 PTD...
Generating LOC105369902 PTD...
Generating HOXC8 PTD...
Generating CDKN1B PTD...
Generating LOC105369891 PTD...
Generating LOC105369602 PTD...
Generating RP11-328J6.1 PTD...
Generating LOC101927484 PTD...
Generating PRDM4 PTD...
Generating LOC107984458 PTD...
CPU times: user 58.5 s, sys: 882 ms, total: 59.4 s
Wall time: 1min 1s


## Make Inversions

In [9]:
%%time
%autoreload
np.random.seed(seed_init)

available_genes = list(set(valid_genes).symmetric_difference(var_genes))
inv_genes = np.random.choice(available_genes, n_vars, replace=False)
var_genes = np.concatenate([var_genes, inv_genes])

for gene in inv_genes:
    print('Generating %s inversion...' % gene)
    txs = [ex['transcript_id'] for ex in all_exons if simu.get_gene_name(ex) == gene]
    tx = [tx for tx in txs if tx in valid_txs][0]
    n_exons, exon = simu.write_large_tsv(tx, all_exons, genome_fasta,
                                     out_prefix, exons_range, vartype='INV')

    tsv_info.append([tx, gene, n_exons, exon, 'INV'])

Generating LOC107987180 inversion...
Generating IFT81 inversion...
Generating LOC105370034 inversion...
Generating LOC105370065 inversion...
Generating CACNA1C-AS1 inversion...
Generating HVCN1 inversion...
Generating RP11-20D14.3 inversion...
Generating AGAP2-AS1 inversion...
Generating AVPR1A inversion...
Generating NUP107 inversion...
CPU times: user 53 s, sys: 676 ms, total: 53.7 s
Wall time: 54.7 s


## Generate reads with ART

In [10]:
%%time
# generate reads with art illumina
np.random.seed(seed_init)
seeds = np.random.randint(0, 99999, 2)

# generate case sample
subprocess.call([art_illumina, '-ss', 'HS25', '-i', case_fasta, 
                 '-p', '-l', str(read_len), '-f', str(fold), '-m', str(frag_size),
                 '-s', str(frag_sd), '-rs', str(seeds[0]), '-o', '%s-case_R' % out_prefix])

# generate control
subprocess.call([art_illumina, '-ss', 'HS25', '-i', control_fasta, 
             '-p', '-l', str(read_len), '-f', str(fold), '-m', str(frag_size),
             '-s', str(frag_sd), '-rs', str(seeds[1]), '-o', '%s-control_R' % out_prefix])

CPU times: user 2.88 ms, sys: 10.5 ms, total: 13.4 ms
Wall time: 3.66 s


In [11]:
%%time
for sample in ['case', 'control']:
    for r in range(2):
        outf = open('%s-%s_R%d.fastq.gz' % (out_prefix, sample, (r+1)), 'w')
        subprocess.call(['gzip', '-c', '%s-%s_R%d.fq' % (out_prefix, sample, (r+1))], stdout=outf)
        outf.close()

CPU times: user 3.6 ms, sys: 17.8 ms, total: 21.4 ms
Wall time: 4.35 s


In [12]:
# Collect every simulated variant into one table and save it as a
# tab-separated file next to the other outputs.
tsv_columns = ['tx', 'gene', 'size', 'exon', 'vartype']
info = pd.DataFrame(tsv_info, columns=tsv_columns)
info.to_csv('%s_simulated.tsv' % out_prefix, sep='\t', index=False)