In [1]:
import os
import numpy as np
import pandas as pd
import pybedtools
import simu
import subprocess
from pybedtools import BedTool
from IPython.core.debugger import set_trace

%load_ext autoreload

# ---------------------------------------------------------------------------
# Notebook configuration: every tunable value lives in this block.
# ---------------------------------------------------------------------------

# high-level parameters
seed_init = 234            # handed to np.random.seed before the first gene draw
n_vars = 10                # genes requested from simu.pick_genes per variant type
n_background_genes = 100   # genes requested for the wild-type background set

# simulation parameters
indel_range = (7, 50)      # passed to simu.write_indel (DEL / INS / ITD)
exons_range = (1, 3)       # passed to simu.write_large_tsv (PTD / INV)

# read generation parameters (forwarded to ART as -f, -m, -s and -l)
fold = 50
frag_size = 300
frag_sd = 20
read_len = 150

# references/output files
# NOTE(review): these are absolute paths on one machine; adjust before running elsewhere.
art_illumina = '/Users/marek.cmero/apps/art_bin_MountRainier/ART_ILLUMINA'
genome_fasta = '/Users/marek.cmero/reference/fastas/Homo_sapiens.GRCh38.dna.primary_assembly.fa'
gtf_ref = '/Users/marek.cmero/reference/gtf/chess_mini_chr12_ref.gtf'

# every output file name is derived from this prefix
out_prefix = '/Users/marek.cmero/Desktop/output/tsvs'
control_fasta = '%s-control.fasta' % out_prefix
case_fasta = '%s-case.fasta' % out_prefix

In [2]:
%%time

# build GTF reference
gr = BedTool(gtf_ref)

# make gene start/end reference
gene_trees = simu.get_gene_features(gr)

# get exons for records that have a gene name
# (feature type is the third GTF column, hence x[2])
all_exons = gr.filter(lambda x: x[2] == 'exon').saveas()

# Look up each exon's gene name once instead of twice per exon; empty names
# are dropped and np.unique leaves a sorted, de-duplicated array.
exon_gene_names = (simu.get_gene_name(ex) for ex in all_exons)
all_genes = np.unique([name for name in exon_gene_names if name != ''])

# one row per simulated variant is appended to this list by the cells below
tsv_info = []

# cleanup and make outdir
# Stale FASTA files from a previous run are removed first
# (presumably because simu appends to them -- confirm in simu).
for stale_fasta in (control_fasta, case_fasta):
    if os.path.exists(stale_fasta):
        os.remove(stale_fasta)

# Create the output directory in-process: unlike shelling out to `mkdir -p`,
# a failure raises here instead of being silently ignored.
outdir = os.path.dirname(out_prefix)
if outdir:  # a bare prefix without a directory part needs no mkdir
    os.makedirs(outdir, exist_ok=True)

CPU times: user 12.3 s, sys: 231 ms, total: 12.6 s
Wall time: 12.7 s


## Make deletion variants

In [3]:
%%time

# Seed numpy's global RNG once, before the first gene draw of the notebook.
np.random.seed(seed_init)

# pick_genes hands back the chosen genes plus a second collection that the
# later cells draw from (presumably the genes not picked -- confirm in simu).
del_genes, available_genes = simu.pick_genes(n_vars, all_genes)

for gene in del_genes:
    print('Generating %s deletion...' % gene)

    # the first transcript returned for the gene receives the deletion
    tx = simu.get_transcripts(gene, all_exons)[0]
    varsize, exon = simu.write_indel(
        tx, all_exons, genome_fasta, indel_range, out_prefix, vartype='DEL'
    )

    # look up the gene location and log one row for this variant
    chrom = simu.get_tx_chrom(tx, all_exons)
    loc = simu.get_gene_loc(chrom, gene_trees, gene)
    record = [loc, tx, gene, varsize, exon, 'DEL']
    tsv_info.append(record)

Generating KRT79 deletion...
Generating LOC105369807 deletion...
Generating LOC105369932 deletion...
Generating LINC00987 deletion...
Generating RP5-944M2.2 deletion...
Generating RP11-497G19.1 deletion...
Generating TMEM132C deletion...
Generating KRT6B deletion...
Generating LOC105369701 deletion...
Generating NECAP1 deletion...
CPU times: user 1min 13s, sys: 620 ms, total: 1min 13s
Wall time: 1min 14s


## Make insertion variants

In [4]:
%%time

# Draw the insertion genes from what the previous draw left over.
ins_genes, available_genes = simu.pick_genes(n_vars, available_genes)

for gene in ins_genes:
    print('Generating %s insertion...' % gene)

    # the first transcript returned for the gene receives the insertion
    tx = simu.get_transcripts(gene, all_exons)[0]
    varsize, exon = simu.write_indel(
        tx, all_exons, genome_fasta, indel_range, out_prefix, vartype='INS'
    )

    # look up the gene location and log one row for this variant
    chrom = simu.get_tx_chrom(tx, all_exons)
    loc = simu.get_gene_loc(chrom, gene_trees, gene)
    record = [loc, tx, gene, varsize, exon, 'INS']
    tsv_info.append(record)

Generating RP11-70F11.11 insertion...
Generating ZNF10 insertion...
Generating RASSF8-AS1 insertion...
Generating TBX5-AS1 insertion...
Generating CCER1 insertion...
Generating LOC107984531 insertion...
Generating LOC100506691 insertion...
Generating LOC101928362 insertion...
Generating C12orf43 insertion...
Generating OR6C4 insertion...
CPU times: user 1min 10s, sys: 398 ms, total: 1min 10s
Wall time: 1min 11s


## Make ITDs

In [5]:
%%time

# Draw the ITD genes from what the previous draws left over.
itd_genes, available_genes = simu.pick_genes(n_vars, available_genes)

for gene in itd_genes:
    print('Generating %s ITD...' % gene)

    # the first transcript returned for the gene receives the ITD
    tx = simu.get_transcripts(gene, all_exons)[0]
    varsize, exon = simu.write_indel(
        tx, all_exons, genome_fasta, indel_range, out_prefix, vartype='ITD'
    )

    # look up the gene location and log one row for this variant
    chrom = simu.get_tx_chrom(tx, all_exons)
    loc = simu.get_gene_loc(chrom, gene_trees, gene)
    record = [loc, tx, gene, varsize, exon, 'ITD']
    tsv_info.append(record)

Generating RP11-424M22.3 ITD...
Generating ATXN7L3B ITD...
Generating LLPH ITD...
Generating LALBA ITD...
Generating FOXJ2 ITD...
Generating RP11-69M1.6 ITD...
Generating LOC101929432 ITD...
Generating TRHDE ITD...
Generating LOC102724834 ITD...
Generating LOC107987180 ITD...
CPU times: user 1min 10s, sys: 395 ms, total: 1min 10s
Wall time: 1min 11s


## Make PTDs

In [6]:
%%time

# get_valid_txs yields (transcript, gene) pairs plus the matching genes for a
# threshold of 2 (presumably a minimum exon count -- confirm in simu).
tx_gene_pairs, valid_genes = simu.get_valid_txs(all_exons, 2)

# keep just the transcript IDs, sorted and de-duplicated
valid_txs = np.unique([pair[0] for pair in tx_gene_pairs])

# From here on only genes that passed the validity check remain available;
# this also narrows the pool used by the cells that follow.
available_genes = list(
    filter(lambda gene_name: gene_name in valid_genes, available_genes)
)
ptd_genes, available_genes = simu.pick_genes(n_vars, available_genes)

for gene in ptd_genes:
    print('Generating %s PTD...' % gene)

    # first transcript for the gene, restricted to the valid set
    tx = simu.get_transcripts(gene, all_exons, valid_txs=valid_txs)[0]
    n_exons, exon = simu.write_large_tsv(
        tx, all_exons, genome_fasta, out_prefix, exons_range, vartype='PTD'
    )

    # look up the gene location and log one row for this variant
    chrom = simu.get_tx_chrom(tx, all_exons)
    loc = simu.get_gene_loc(chrom, gene_trees, gene)
    record = [loc, tx, gene, n_exons, exon, 'PTD']
    tsv_info.append(record)

Generating RP11-320N7.2 PTD...
Generating DNAH10 PTD...
Generating LACRT PTD...
Generating LOC105369965 PTD...
Generating LOC101928002 PTD...
Generating LOC105369878 PTD...
Generating MYF5 PTD...
Generating TM7SF3 PTD...
Generating ART4 PTD...
Generating KLRC4-KLRK1 PTD...
CPU times: user 1min 16s, sys: 604 ms, total: 1min 16s
Wall time: 1min 17s


## Make inversions

In [7]:
%%time

# Draw the inversion genes from the pool left over by the earlier draws.
inv_genes, available_genes = simu.pick_genes(n_vars, available_genes)

for gene in inv_genes:
    print('Generating %s inversion...' % gene)

    # first transcript for the gene, restricted to the valid set
    tx = simu.get_transcripts(gene, all_exons, valid_txs=valid_txs)[0]
    n_exons, exon = simu.write_large_tsv(
        tx, all_exons, genome_fasta, out_prefix, exons_range, vartype='INV'
    )

    # look up the gene location and log one row for this variant
    chrom = simu.get_tx_chrom(tx, all_exons)
    loc = simu.get_gene_loc(chrom, gene_trees, gene)
    record = [loc, tx, gene, n_exons, exon, 'INV']
    tsv_info.append(record)

Generating RP13-81N3.2 inversion...
Generating LOC107984446 inversion...
Generating RP5-944M2.1 inversion...
Generating PUS1 inversion...
Generating RP11-413B19.2 inversion...
Generating PLEKHG6 inversion...
Generating FAM109A inversion...
Generating LOC100128276 inversion...
Generating LOC107984433 inversion...
Generating KRT71 inversion...
CPU times: user 1min 16s, sys: 816 ms, total: 1min 17s
Wall time: 1min 18s


## Background genes

In [8]:
%%time
# Background genes carry no variant: the same wild-type transcript sequence
# is written to both the control and the case FASTA.
bg_set, available_genes = simu.pick_genes(n_background_genes, available_genes)

for gene in bg_set:
    tx = simu.get_transcripts(gene, all_exons, valid_txs=valid_txs)[0]

    # saveas() is called straight away, so the filter runs while `tx` still
    # refers to this iteration's transcript
    exons = all_exons.filter(
        lambda feature: feature['transcript_id'] == tx
    ).saveas()
    tx_seq, strand = simu.get_seq(exons, genome_fasta)

    # control first, then case
    for fasta_path in (control_fasta, case_fasta):
        simu.write_wildtype_sequence(tx_seq, strand, fasta_path, tx)

CPU times: user 8min 37s, sys: 6.18 s, total: 8min 43s
Wall time: 8min 53s


## Generate reads with ART

In [9]:
%%time
# generate reads with art illumina
# One ART seed per sample, drawn from numpy's global RNG.
seeds = np.random.randint(0, 99999, 2)

# (input FASTA, ART seed, output prefix); the case sample comes first and the
# control second, so each keeps the seed index it is paired with here.
art_jobs = [
    (case_fasta, seeds[0], '%s-case_R' % out_prefix),
    (control_fasta, seeds[1], '%s-control_R' % out_prefix),
]

for fasta, seed, read_prefix in art_jobs:
    # check_call raises CalledProcessError on a non-zero exit status, so a
    # failed ART run stops the notebook here instead of letting the following
    # cells work on missing or truncated .fq files.
    subprocess.check_call([art_illumina, '-ss', 'HS25', '-i', fasta,
                           '-p', '-l', str(read_len), '-f', str(fold),
                           '-m', str(frag_size), '-s', str(frag_sd),
                           '-rs', str(seed), '-o', read_prefix])

CPU times: user 2.03 ms, sys: 9.16 ms, total: 11.2 ms
Wall time: 9.49 s


In [10]:
%%time
# Compress each read file to <prefix>-<sample>_R<n>.fastq.gz; `gzip -c` writes
# to stdout, so the uncompressed .fq file is left in place.
for sample in ['case', 'control']:
    for r in range(2):
        read_base = '%s-%s_R%d' % (out_prefix, sample, (r+1))
        # Binary mode matches the gzip byte stream, and the context manager
        # closes the handle even when gzip cannot be started or fails.
        with open(read_base + '.fastq.gz', 'wb') as outf:
            # raise on a non-zero gzip exit instead of keeping a bad archive
            subprocess.check_call(['gzip', '-c', read_base + '.fq'], stdout=outf)

CPU times: user 4.48 ms, sys: 18.3 ms, total: 22.8 ms
Wall time: 11 s


In [11]:
# Write the table of simulated variants: one row per entry appended to
# tsv_info by the variant cells, in the order those cells ran.
info_columns = ['loc', 'tx', 'gene', 'size', 'exon', 'vartype']
info = pd.DataFrame(tsv_info, columns=info_columns)

# tab-separated, no index column
info_path = '%s_simulated.tsv' % out_prefix
info.to_csv(info_path, sep='\t', index=False)