In [1]:
import os
import numpy as np
import pandas as pd
import pybedtools
import simu
import subprocess
from pybedtools import BedTool
from IPython.core.debugger import set_trace

%load_ext autoreload

# high-level parameters
# seed handed to np.random.seed before the first gene pick
seed_init = 234
# number of genes picked with simu.pick_genes for each variant type
n_vars = 10
# number of background genes whose transcript is written to both FASTAs
n_background_genes = 100
# passed to simu.get_valid_txs; presumably the minimum exon count a
# transcript needs to be usable -- TODO confirm in simu
min_exons = 3

# simulation parameters
# passed to the simu variant writers; presumably the (min, max) size of the
# simulated block -- TODO confirm in simu
block_range = (30, 200)

# read generation parameters (forwarded to the ART command line below)
fold = 50  # ART -f
frag_size = 300  # ART -m
frag_sd = 20  # ART -s
read_len = 150  # ART -l

# references/output files
# NOTE(review): these are machine-specific absolute paths; adjust them before
# running the notebook anywhere else.
# ART executable invoked through subprocess
art_illumina = '/Users/marek.cmero/apps/art_bin_MountRainier/ART_ILLUMINA'
# genome FASTA passed to the simu sequence writers
genome_fasta = '/Users/marek.cmero/reference/fastas/Homo_sapiens.GRCh38.dna.primary_assembly.fa'
# GTF annotation loaded with BedTool
gtf_ref = '/Users/marek.cmero/reference/gtf/chess_mini_chr12_ref.gtf'
# junction reference read by simu.build_junc_ref
junc_ref_file = '/Users/marek.cmero/reference/gtf/chess_mini_chr12_ref.info'
# common prefix of every output file; its parent directory is created later
out_prefix = '/Users/marek.cmero/Desktop/output/splicevars'
control_fasta = '%s-control.fasta' % out_prefix
case_fasta = '%s-case.fasta' % out_prefix

In [2]:
%autoreload
# build GTF reference and the junction reference used by the simulators
gr = BedTool(gtf_ref)
junc_ref = simu.build_junc_ref(junc_ref_file)

# make gene start/end reference
gene_trees = simu.get_gene_features(gr)

# get exons for records that have a gene name
all_exons = gr.filter(lambda x: x[2] == 'exon').saveas()
# look the gene name up once per exon instead of twice
exon_gene_names = (simu.get_gene_name(ex) for ex in all_exons)
all_genes = np.unique([name for name in exon_gene_names if name != ''])

# rows of [loc, tx, gene, size, exon, vartype]; filled by the variant cells
splice_info = []

# get valid txs
valid_txs, valid_genes = simu.get_valid_txs(all_exons, min_exons)
valid_txs = np.unique([tx for tx, gn in valid_txs])
available_genes = [gene for gene in all_genes if gene in valid_genes]

# cleanup: drop FASTAs left over from a previous run
# (presumably the simu writers append to these files -- confirm in simu)
for stale_fasta in (control_fasta, case_fasta):
    if os.path.exists(stale_fasta):
        os.remove(stale_fasta)

# make outdir with the standard library rather than shelling out to `mkdir -p`;
# a prefix without a directory part needs no directory to be created
outdir = os.path.dirname(out_prefix)
if outdir and not os.path.isdir(outdir):
    os.makedirs(outdir)

0

## Extended exon variants

In [3]:
%%time
%autoreload
# seed the global numpy RNG before the first gene pick
np.random.seed(seed_init)
ee_genes, available_genes = simu.pick_genes(n_vars, available_genes)

for gene in ee_genes:
    print('Generating %s extended exon...' % gene)
    tx, loc, stats = simu.write_novel_exon(
        gene, valid_txs, all_exons, genome_fasta, out_prefix,
        block_range, gene_trees, vartype='EE'
    )
    # one row per simulated variant: loc, tx, gene, size, exon, vartype
    ee_record = [loc, tx, gene, stats['varsize'], stats['exon'], 'EE']
    splice_info.append(ee_record)

Generating LOC105369720 extended exon...
Generating ZCCHC8 extended exon...
Generating GTSF1 extended exon...
Generating WSB2 extended exon...
Generating CDK4 extended exon...
Generating STAT6 extended exon...
Generating KRR1 extended exon...
Generating LMNTD1 extended exon...
Generating TM7SF3 extended exon...
Generating SLC15A4 extended exon...
CPU times: user 1min 59s, sys: 1.84 s, total: 2min 1s
Wall time: 2min 8s


## Novel exon variants

In [4]:
%%time
ne_genes, available_genes = simu.pick_genes(n_vars, available_genes)

for gene in ne_genes:
    # Keep trying until a gene yields a transcript; an empty tx means the
    # picked gene had no suitable transcript and a replacement is drawn.
    while True:
        # the re-pick path used to log "retained intron" here (copy-paste slip)
        print('Generating %s novel exon...' % gene)
        tx, loc, stats = simu.write_novel_exon(gene, valid_txs, all_exons,
                                               genome_fasta, out_prefix,
                                               block_range, gene_trees, vartype='NE')
        if tx != '':
            break

        # no valid genes, repick
        print('%s has no suitable transcripts, repicking...' % gene)
        genes, available_genes = simu.pick_genes(1, available_genes)
        gene = genes[0]

    splice_info.append([loc, tx, gene, stats['varsize'], stats['exon'], 'NE'])

Generating C12orf66 novel exon...
Generating TUBA1B novel exon...
TUBA1B has no suitable transcripts, repicking...
Generating LOC101929469 retained intron...
Generating SLC2A13 novel exon...
Generating BCAT1 novel exon...
Generating LGR5 novel exon...
Generating LOC101927882 novel exon...
Generating SLCO1C1 novel exon...
Generating RNFT2 novel exon...
Generating RP11-121G22.3 novel exon...
Generating C12orf40 novel exon...
CPU times: user 2min 55s, sys: 2.54 s, total: 2min 57s
Wall time: 3min 10s


## Retained intron variants

In [6]:
%%time

ri_genes, available_genes = simu.pick_genes(n_vars, available_genes)

for gene in ri_genes:
    # Loop until a gene yields a transcript; an empty tx triggers a re-pick.
    while True:
        print('Generating %s retained intron...' % gene)
        tx, loc, stats = simu.write_novel_exon(
            gene, valid_txs, all_exons, genome_fasta, out_prefix,
            block_range, gene_trees, vartype='RI'
        )
        if tx != '':
            break

        # no valid genes, repick
        print('%s has no suitable transcripts, repicking...' % gene)
        genes, available_genes = simu.pick_genes(1, available_genes)
        gene = genes[0]

    splice_info.append([loc, tx, gene, stats['varsize'], stats['exon'], 'RI'])

Generating PTPRB retained intron...
Generating LOC107984543 retained intron...
Generating LOC105370005 retained intron...
Generating LOC105369710 retained intron...
Generating CISTR retained intron...
Generating LINC00612 retained intron...
Generating MANSC1 retained intron...
Generating LOC105369963 retained intron...
Generating BLOC1S1-RDH5 retained intron...
Generating WBP11 retained intron...
CPU times: user 2min 2s, sys: 1.51 s, total: 2min 4s
Wall time: 2min 11s


## Truncated exon variants

Truncate adjacent exons to create a novel exon junction (NEJ) variant.

In [7]:
%%time

nej_genes, available_genes = simu.pick_genes(n_vars, available_genes)

for gene in nej_genes:
    print('Generating %s truncated exon...' % gene)

    # use the first transcript returned for this gene
    gene_txs = simu.get_transcripts(gene, all_exons, valid_txs=valid_txs)
    tx = gene_txs[0]
    varsize, exon = simu.write_trunc_exons(tx, all_exons, genome_fasta,
                                           out_prefix, block_range)

    # look the gene location up via the transcript's chromosome
    chrom = simu.get_tx_chrom(tx, all_exons)
    loc = simu.get_gene_loc(chrom, gene_trees, gene)

    nej_record = [loc, tx, gene, varsize, exon, 'NEJ']
    splice_info.append(nej_record)

Generating CCDC59 truncated exon...
Generating KRAS truncated exon...
Generating DTX3 truncated exon...
Generating UTP20 truncated exon...
Generating LHX5 truncated exon...
Generating LOC105370087 truncated exon...
Generating KRT3 truncated exon...
Generating RP11-983C2.2 truncated exon...
Generating GRIN2B truncated exon...
Generating FAM19A2 truncated exon...
CPU times: user 1min 46s, sys: 1.02 s, total: 1min 47s
Wall time: 1min 51s


## Make unannotated splice variants

In [8]:
%%time

us_genes, available_genes = simu.pick_genes(n_vars, available_genes)

for gene in us_genes:
    print('Generating %s unannotated splice variant...' % gene)
    tx, loc, stats = simu.write_unannot_splice(gene, all_exons, valid_txs, genome_fasta,
                                               out_prefix, junc_ref, gene_trees)
    # Record this variant's own size/exon. The bare names `varsize` and `exon`
    # are globals left over from the truncated-exon cell, so using them would
    # stamp every row here with the last truncated-exon values.
    # NOTE(review): assumes `stats` carries 'varsize' and 'exon' keys like the
    # dict returned by simu.write_novel_exon -- confirm in simu.
    splice_info.append([loc, tx, gene, stats['varsize'], stats['exon'], 'US'])

Generating LOC101929974 unannotated splice variant...
Generating PTHLH unannotated splice variant...
Generating LOC105370092 unannotated splice variant...
Generating LINC00507 unannotated splice variant...
Generating LOC101927583 unannotated splice variant...
Generating PRB4 unannotated splice variant...
Generating LOC107984540 unannotated splice variant...
Generating RMST unannotated splice variant...
Generating LOC101928346 unannotated splice variant...
Generating LOC105369689 unannotated splice variant...
CPU times: user 1min 40s, sys: 874 ms, total: 1min 40s
Wall time: 1min 42s


## Background genes

In [9]:
%%time
# write background genes: the same transcript sequence goes to both samples
bg_set, available_genes = simu.pick_genes(n_background_genes, available_genes)
for gene in bg_set:
    # use the first transcript returned for this gene
    gene_txs = simu.get_transcripts(gene, all_exons, valid_txs=valid_txs)
    tx = gene_txs[0]
    exons = all_exons.filter(lambda feature: feature['transcript_id'] == tx).saveas()
    tx_seq, strand = simu.get_seq(exons, genome_fasta)
    # control first, then case
    for fasta_out in (control_fasta, case_fasta):
        simu.write_wildtype_sequence(tx_seq, strand, fasta_out, tx)

CPU times: user 8min 7s, sys: 4.06 s, total: 8min 11s
Wall time: 8min 20s


## Compile info on variants

In [10]:
# tabulate every simulated variant and save it as a tab-separated file
info_columns = ['loc', 'tx', 'gene', 'size', 'exon', 'vartype']
info = pd.DataFrame(splice_info, columns=info_columns)
info_path = '%s_simulated.tsv' % out_prefix
info.to_csv(info_path, index=False, sep='\t')

## Generate reads

In [11]:
%%time
# generate reads with art illumina
# one ART random seed per sample, drawn from the global numpy RNG
seeds = np.random.randint(0, 99999, 2)

# case sample first, then control; identical ART options apart from the
# input FASTA, the seed and the output prefix
art_runs = [('case', case_fasta, seeds[0]),
            ('control', control_fasta, seeds[1])]
for sample, fasta, art_seed in art_runs:
    # check_call raises if ART exits non-zero, so a failed simulation stops
    # the cell instead of silently leaving missing or partial FASTQ files
    subprocess.check_call([art_illumina, '-ss', 'HS25', '-i', fasta,
                           '-p', '-l', str(read_len), '-f', str(fold), '-m', str(frag_size),
                           '-s', str(frag_sd), '-rs', str(art_seed),
                           '-o', '%s-%s_R' % (out_prefix, sample)])

# gzip files
for sample in ['case', 'control']:
    for r in range(2):
        fq_path = '%s-%s_R%d.fq' % (out_prefix, sample, (r+1))
        gz_path = '%s-%s_R%d.fastq.gz' % (out_prefix, sample, (r+1))
        # binary mode for compressed bytes; `with` closes the handle even
        # when gzip fails
        with open(gz_path, 'wb') as outf:
            subprocess.check_call(['gzip', '-c', fq_path], stdout=outf)

CPU times: user 6.3 ms, sys: 30.4 ms, total: 36.7 ms
Wall time: 21.3 s
