In [1]:
import os
import numpy as np
import pandas as pd
import pybedtools
import simu
import subprocess
from pybedtools import BedTool
from IPython.core.debugger import set_trace

%load_ext autoreload

# high-level parameters
seed_init = 234            # passed to np.random.seed before the first gene draw
n_vars = 10                # genes requested from simu.pick_genes per variant class
n_background_genes = 100   # NOTE(review): not referenced by any cell visible here
min_exons = 3              # passed to simu.get_valid_txs when selecting transcripts

# simulation parameters
# handed to simu.write_novel_exon / simu.write_trunc_exons; presumably the
# (min, max) size of the simulated block -- TODO confirm in simu
block_range = (30, 200)

# read generation parameters (forwarded to ART_ILLUMINA as strings)
fold = 50         # -f: fold coverage
frag_size = 300   # -m: mean fragment size for paired-end simulation
frag_sd = 20      # -s: standard deviation of the fragment size
read_len = 150    # -l: read length

# references/output files
# The defaults are the original machine-specific locations. Each one can be
# overridden through the named environment variable, so the notebook can run
# on another machine without editing this cell.
art_illumina = os.environ.get(
    'SIMU_ART_ILLUMINA',
    '/Users/marek.cmero/apps/art_bin_MountRainier/ART_ILLUMINA')
genome_fasta = os.environ.get(
    'SIMU_GENOME_FASTA',
    '/Users/marek.cmero/reference/fastas/Homo_sapiens.GRCh38.dna.primary_assembly.fa')
gtf_ref = os.environ.get(
    'SIMU_GTF_REF',
    '/Users/marek.cmero/reference/gtf/chess_mini_chr12_ref.gtf')
out_prefix = os.environ.get(
    'SIMU_OUT_PREFIX',
    '/Users/marek.cmero/Desktop/output/splicevars')

# per-sample FASTA files, both derived from out_prefix
control_fasta = '%s-control.fasta' % out_prefix
case_fasta = '%s-case.fasta' % out_prefix

In [2]:
# build GTF reference
gr = BedTool(gtf_ref)

# make gene start/end reference
gene_trees = simu.get_gene_features(gr)

# get exons for records that have a gene name
# (feature type is the third GTF field; saveas() materialises the filtered set
# so it can be iterated more than once)
all_exons = gr.filter(lambda x: x[2] == 'exon').saveas()

# look the gene name up once per exon, then keep the distinct non-empty names
all_genes = np.unique([name for name in map(simu.get_gene_name, all_exons) if name != ''])
var_genes = np.empty(0)

# one row per simulated variant; the variant cells below append to this list,
# so re-running this cell resets the collected records
splice_info = []

# get valid txs
# get_valid_txs yields (transcript, gene) pairs plus a collection of genes;
# only the distinct transcript ids are kept from the pairs
valid_txs, valid_genes = simu.get_valid_txs(all_exons, min_exons)
valid_txs = np.unique([tx for tx, gn in valid_txs])
available_genes = [gene for gene in all_genes if gene in valid_genes]

# cleanup and make outdir
# remove FASTAs left by a previous run (presumably the simu writers append to
# them -- TODO confirm), so a fresh run starts from empty files
for stale_fasta in (control_fasta, case_fasta):
    if os.path.exists(stale_fasta):
        os.remove(stale_fasta)

# create the output directory in-process instead of shelling out to `mkdir -p`;
# skipped when out_prefix has no directory component
outdir = os.path.dirname(out_prefix)
if outdir and not os.path.isdir(outdir):
    os.makedirs(outdir)

## Extended exon variants

In [3]:
# %%time
%autoreload

np.random.seed(seed_init)
ee_genes, available_genes = simu.pick_genes(n_vars, available_genes)

for gene in ee_genes:
    print('Generating %s extended exon...' % gene)
    
    tx, chrom = simu.pick_transcript(gene, all_exons, valid_txs=valid_txs)
    varsize, exon = simu.write_novel_exon(tx, all_exons, genome_fasta, out_prefix, 
                                          block_range, vartype='EE')

    loc = simu.get_gene_loc(chrom, gene_trees, gene)
    splice_info.append([loc, tx, gene, varsize, exon, 'EE'])

Generating LOC105369720 extended exon...
Generating ZCCHC8 extended exon...
Generating GTSF1 extended exon...
Generating WSB2 extended exon...
Generating CDK4 extended exon...
Generating STAT6 extended exon...
Generating KRR1 extended exon...
Generating LMNTD1 extended exon...
Generating TM7SF3 extended exon...
Generating SLC15A4 extended exon...


## Novel exon variants

In [4]:
%%time

ne_genes, available_genes = simu.pick_genes(n_vars, available_genes)

for gene in ne_genes:
    print('Generating %s novel exon...' % gene)
    
    tx, chrom = simu.pick_transcript(gene, all_exons, valid_txs=valid_txs)
    varsize, exon = simu.write_novel_exon(tx, all_exons, genome_fasta, out_prefix,
                                          block_range, vartype='NE')

    loc = simu.get_gene_loc(chrom, gene_trees, gene)
    splice_info.append([loc, tx, gene, varsize, exon, 'NE'])

Generating LGR5 novel exon...
Generating MCRS1 novel exon...
Generating LOC105369811 novel exon...
Generating LOC105369887 novel exon...
Generating KITLG novel exon...
Generating LUM novel exon...
Generating LOC105370059 novel exon...
Generating TBX5-AS1 novel exon...
Generating KSR2 novel exon...
Generating CEP290 novel exon...
CPU times: user 54.8 s, sys: 658 ms, total: 55.5 s
Wall time: 59.6 s


## Retained intron variants

In [5]:
%%time
%autoreload
ri_genes, available_genes = simu.pick_genes(n_vars, available_genes)

for gene in ri_genes:
    print('Generating %s retained intron...' % gene)
    
    tx, chrom = simu.pick_transcript(gene, all_exons, valid_txs=valid_txs)
    varsize, exon = simu.write_novel_exon(tx, all_exons, genome_fasta, out_prefix,
                                          block_range, vartype='RI')

    loc = simu.get_gene_loc(chrom, gene_trees, gene)
    splice_info.append([loc, tx, gene, varsize, exon, 'RI'])

Generating SP1 retained intron...
Generating COQ5 retained intron...
Generating MVK retained intron...
Generating TSFM retained intron...
Generating LOC107984475 retained intron...
Generating UTP20 retained intron...
Generating LOC102723544 retained intron...
Generating TPH2 retained intron...
Generating MGST1 retained intron...
Generating LOC101930023 retained intron...
CPU times: user 53.7 s, sys: 578 ms, total: 54.3 s
Wall time: 57.7 s


## Truncated exon variants

Truncate adjacent exons to create a novel exon junction (NEJ) variant.

In [6]:
%%time
%autoreload

nej_genes, available_genes = simu.pick_genes(n_vars, available_genes)

for gene in nej_genes:
    print('Generating %s truncated exon...' % gene)
    
    tx, chrom = simu.pick_transcript(gene, all_exons, valid_txs=valid_txs)
    varsize, exon = simu.write_trunc_exons(tx, all_exons, genome_fasta, out_prefix, block_range)

    loc = simu.get_gene_loc(chrom, gene_trees, gene)
    splice_info.append([loc, tx, gene, varsize, exon, 'NEJ'])

Generating LOC105370079 truncated exon...
Generating FAM66C truncated exon...
Generating FAM186A truncated exon...
Generating GLIPR1L1 truncated exon...
Generating GXYLT1 truncated exon...
Generating LOC105370027 truncated exon...
Generating RP11-486A14.2 truncated exon...
Generating LOC100507377 truncated exon...
Generating LOC107984528 truncated exon...
Generating LINC00937 truncated exon...
CPU times: user 1min 15s, sys: 592 ms, total: 1min 15s
Wall time: 1min 18s


## Compile info on variants

In [7]:
# Tabulate the records collected by the variant cells and write them out as a
# tab-separated file alongside the other outputs (no index column).
info_columns = ['loc', 'tx', 'gene', 'size', 'exon', 'vartype']
info = pd.DataFrame(splice_info, columns=info_columns)

info_path = '%s_simulated.tsv' % out_prefix
info.to_csv(info_path, sep='\t', index=False)

## Generate reads

In [8]:
%%time
# generate reads with art illumina
seeds = np.random.randint(0, 99999, 2)

# generate case sample
subprocess.call([art_illumina, '-ss', 'HS25', '-i', case_fasta, 
                 '-p', '-l', str(read_len), '-f', str(fold), '-m', str(frag_size),
                 '-s', str(frag_sd), '-rs', str(seeds[0]), '-o', '%s-case_R' % out_prefix])

# generate control
subprocess.call([art_illumina, '-ss', 'HS25', '-i', control_fasta, 
             '-p', '-l', str(read_len), '-f', str(fold), '-m', str(frag_size),
             '-s', str(frag_sd), '-rs', str(seeds[1]), '-o', '%s-control_R' % out_prefix])

# gzip files
for sample in ['case', 'control']:
    for r in range(2):
        outf = open('%s-%s_R%d.fastq.gz' % (out_prefix, sample, (r+1)), 'w')
        subprocess.call(['gzip', '-c', '%s-%s_R%d.fq' % (out_prefix, sample, (r+1))], stdout=outf)
        outf.close()

CPU times: user 4.58 ms, sys: 23.8 ms, total: 28.4 ms
Wall time: 7.85 s
