In [1]:
import os
import numpy as np
import pandas as pd
import pybedtools
import simu
import subprocess
from pybedtools import BedTool
from IPython.core.debugger import set_trace

%load_ext autoreload

# high-level parameters
# seed_init: seed handed to np.random.seed before the first set of variant genes is picked
# n_vars: number of genes requested from simu.pick_genes for each variant type
# n_background_genes: not referenced by any cell shown here -- TODO confirm it is still needed
# min_exons: passed to simu.get_valid_txs; presumably the minimum exon count a
#            transcript needs to be considered valid -- verify in simu
seed_init = 234
n_vars = 10
n_background_genes = 100
min_exons = 3

# simulation parameters
# block_range is passed to simu.write_novel_exon and simu.write_trunc_exons;
# presumably the (min, max) size of the simulated block -- verify in simu
block_range = (30, 200)

# read generation parameters, forwarded to the ART command line:
# fold -> -f, frag_size -> -m, frag_sd -> -s, read_len -> -l
fold = 50
frag_size = 300
frag_sd = 20
read_len = 150

# references/output files
# NOTE(review): these are absolute paths on one machine; adjust before running elsewhere.
# art_illumina: ART executable invoked to generate reads
# genome_fasta: genome FASTA passed to the simu.write_* helpers
# gtf_ref: annotation loaded with BedTool
# junc_ref_file: input to simu.build_junc_ref
# out_prefix: common prefix for every output file; its parent directory is created if missing
# control_fasta / case_fasta: FASTA files derived from out_prefix and used as ART input
art_illumina = '/Users/marek.cmero/apps/art_bin_MountRainier/ART_ILLUMINA'
genome_fasta = '/Users/marek.cmero/reference/fastas/Homo_sapiens.GRCh38.dna.primary_assembly.fa'
gtf_ref = '/Users/marek.cmero/reference/gtf/chess_mini_chr12_ref.gtf'
junc_ref_file = '/Users/marek.cmero/reference/gtf/chess_mini_chr12_ref.info'
out_prefix = '/Users/marek.cmero/Desktop/output/splicevars'
control_fasta = '%s-control.fasta' % out_prefix
case_fasta = '%s-case.fasta' % out_prefix

In [2]:
%autoreload
# load the GTF annotation and build the junction reference
gr = BedTool(gtf_ref)
junc_ref = simu.build_junc_ref(junc_ref_file)

# gene start/end reference, handed to the variant writers further down
gene_trees = simu.get_gene_features(gr)

# keep exon records only, then collect the distinct non-empty gene names
all_exons = gr.filter(lambda feature: feature[2] == 'exon').saveas()
exon_gene_names = [simu.get_gene_name(ex) for ex in all_exons]
all_genes = np.unique([name for name in exon_gene_names if name != ''])

# accumulators: splice_info receives one row per simulated variant
var_genes = np.empty(0)
splice_info = []

# valid transcripts/genes as reported by simu.get_valid_txs (filtered with min_exons)
tx_gene_pairs, valid_genes = simu.get_valid_txs(all_exons, min_exons)
valid_txs = np.unique([tx_id for tx_id, _gn in tx_gene_pairs])
available_genes = [name for name in all_genes if name in valid_genes]

# cleanup and make outdir
# Remove FASTA output left over from a previous run.
# NOTE(review): presumably the simu.write_* helpers append to these files -- confirm.
for stale_fasta in (control_fasta, case_fasta):
    if os.path.exists(stale_fasta):
        os.remove(stale_fasta)

# Create the output directory natively rather than shelling out to `mkdir -p`:
# this is portable and raises on failure instead of returning an exit code
# that nobody checks.
outdir = os.path.dirname(out_prefix)
if outdir:  # a prefix without a directory part needs no directory created
    os.makedirs(outdir, exist_ok=True)

0

## Extended exon variants

In [3]:
%%time
%autoreload
# seed numpy's global RNG before any genes are picked
np.random.seed(seed_init)
ee_genes, available_genes = simu.pick_genes(n_vars, available_genes)

for gene in ee_genes:
    print('Generating %s extended exon...' % gene)
    tx, loc, stats = simu.write_novel_exon(
        gene, valid_txs, all_exons, genome_fasta, out_prefix,
        block_range, gene_trees, vartype='EE'
    )
    # one row per variant: location, transcript, gene, size, exon, type
    ee_record = [loc, tx, gene, stats['varsize'], stats['exon'], 'EE']
    splice_info.append(ee_record)

Generating LOC105369720 extended exon...
Generating ZCCHC8 extended exon...
Generating GTSF1 extended exon...
Generating WSB2 extended exon...
Generating CDK4 extended exon...
Generating STAT6 extended exon...
Generating KRR1 extended exon...
Generating LMNTD1 extended exon...
Generating TM7SF3 extended exon...
Generating SLC15A4 extended exon...
CPU times: user 1min 50s, sys: 1.15 s, total: 1min 52s
Wall time: 1min 58s


## Novel exon variants

In [4]:
%%time
# take the next batch of genes out of the remaining pool
ne_genes, available_genes = simu.pick_genes(n_vars, available_genes)

for gene in ne_genes:
    print('Generating %s novel exon...' % gene)
    tx, loc, stats = simu.write_novel_exon(
        gene, valid_txs, all_exons, genome_fasta, out_prefix,
        block_range, gene_trees, vartype='NE'
    )
    # one row per variant: location, transcript, gene, size, exon, type
    ne_record = [loc, tx, gene, stats['varsize'], stats['exon'], 'NE']
    splice_info.append(ne_record)

Generating C12orf66 novel exon...
Generating PLXNC1 novel exon...
Generating SMAGP novel exon...
Generating BCAT1 novel exon...
Generating LGR5 novel exon...
Generating LOC101927882 novel exon...
Generating MYL2 novel exon...
Generating RP1-166H1.2 novel exon...
Generating RP11-121J20.1 novel exon...
Generating C12orf40 novel exon...
CPU times: user 1min 52s, sys: 1.24 s, total: 1min 54s
Wall time: 2min 1s


## Retained intron variants

In [5]:
%%time
# take the next batch of genes out of the remaining pool
ri_genes, available_genes = simu.pick_genes(n_vars, available_genes)

for gene in ri_genes:
    print('Generating %s retained intron...' % gene)
    tx, loc, stats = simu.write_novel_exon(
        gene, valid_txs, all_exons, genome_fasta, out_prefix,
        block_range, gene_trees, vartype='RI'
    )
    # one row per variant: location, transcript, gene, size, exon, type
    ri_record = [loc, tx, gene, stats['varsize'], stats['exon'], 'RI']
    splice_info.append(ri_record)

Generating ERBB3 retained intron...
Generating LOC105369859 retained intron...
Generating PRH2 retained intron...
Generating LOC105369632 retained intron...
Generating KANSL2 retained intron...
Generating ELK3 retained intron...
Generating LOC101927019 retained intron...
Generating TRHDE retained intron...
Generating LOC100506159 retained intron...
Generating ETFBKMT retained intron...
CPU times: user 1min 44s, sys: 1.01 s, total: 1min 45s
Wall time: 1min 49s


## Truncated exon variants

Truncate adjacent exons to create a novel exon junction (NEJ) variant.

In [6]:
%%time

# take the next batch of genes out of the remaining pool
nej_genes, available_genes = simu.pick_genes(n_vars, available_genes)

for gene in nej_genes:
    print('Generating %s truncated exon...' % gene)

    # work on the first transcript returned for this gene
    gene_txs = simu.get_transcripts(gene, all_exons, valid_txs=valid_txs)
    tx = gene_txs[0]
    varsize, exon = simu.write_trunc_exons(
        tx, all_exons, genome_fasta, out_prefix, block_range
    )

    # look up the gene location for the info table
    chrom = simu.get_tx_chrom(tx, all_exons)
    loc = simu.get_gene_loc(chrom, gene_trees, gene)
    nej_record = [loc, tx, gene, varsize, exon, 'NEJ']
    splice_info.append(nej_record)

Generating LDHB truncated exon...
Generating BLOC1S1-RDH5 truncated exon...
Generating FRS2 truncated exon...
Generating RPS26 truncated exon...
Generating LOC105369686 truncated exon...
Generating ANAPC7 truncated exon...
Generating LOC105369609 truncated exon...
Generating LOC105369890 truncated exon...
Generating DRAM1 truncated exon...
Generating FAM66C truncated exon...
CPU times: user 1min 42s, sys: 832 ms, total: 1min 43s
Wall time: 1min 46s


## Make unannotated splice variants

In [8]:
# %%time
%autoreload

# take the next batch of genes out of the remaining pool
us_genes, available_genes = simu.pick_genes(n_vars, available_genes)

for gene in us_genes:
    print('Generating %s unannotated splice variant...' % gene)
    tx, loc, stats = simu.write_unannot_splice(gene, all_exons, valid_txs, genome_fasta,
                                               out_prefix, junc_ref, gene_trees)
    # Record this variant's own size and exon from the returned stats. The bare
    # names `varsize` and `exon` are never assigned in this cell: they are
    # leftovers from the truncated-exon cell, so using them would stamp every
    # row with that cell's last gene (or raise NameError on a fresh kernel).
    # NOTE(review): assumes stats carries 'varsize' and 'exon' keys, as the
    # stats returned by simu.write_novel_exon do -- confirm in simu.
    splice_info.append([loc, tx, gene, stats['varsize'], stats['exon'], 'US'])

Generating RILPL1 unannotated splice variant...
Generating LOC105370029 unannotated splice variant...
Generating HECTD4 unannotated splice variant...
Generating CDCA3 unannotated splice variant...
Generating LOC105369607 unannotated splice variant...
Generating LOC105369958 unannotated splice variant...
Generating CLEC2D unannotated splice variant...
Generating RNF10 unannotated splice variant...
Generating LOC100506551 unannotated splice variant...
Generating FGD4 unannotated splice variant...


## Compile info on variants

In [None]:
# tabulate every simulated variant and write the table as a tab-separated file
variant_columns = ['loc', 'tx', 'gene', 'size', 'exon', 'vartype']
info = pd.DataFrame(splice_info, columns=variant_columns)
info.to_csv('%s_simulated.tsv' % out_prefix, sep='\t', index=False)

## Generate reads

In [None]:
%%time
# generate reads with art illumina
# one ART random seed per sample, drawn from numpy's global RNG
seeds = np.random.randint(0, 99999, 2)

# Simulate paired-end reads for the case sample, then the control
# (seeds[0] -> case, seeds[1] -> control).
for sample, fasta, art_seed in [('case', case_fasta, seeds[0]),
                                ('control', control_fasta, seeds[1])]:
    # check_call raises CalledProcessError when ART exits non-zero, so a failed
    # simulation stops the cell instead of leaving missing .fq files to be
    # discovered at the gzip step.
    subprocess.check_call([art_illumina, '-ss', 'HS25', '-i', fasta,
                           '-p', '-l', str(read_len), '-f', str(fold), '-m', str(frag_size),
                           '-s', str(frag_sd), '-rs', str(art_seed),
                           '-o', '%s-%s_R' % (out_prefix, sample)])

# gzip files
for sample in ['case', 'control']:
    for mate in (1, 2):
        fq_path = '%s-%s_R%d.fq' % (out_prefix, sample, mate)
        gz_path = '%s-%s_R%d.fastq.gz' % (out_prefix, sample, mate)
        # binary mode, and the handle is closed even if gzip fails
        with open(gz_path, 'wb') as outf:
            subprocess.check_call(['gzip', '-c', fq_path], stdout=outf)