In [1]:
import sys
import argparse
import numpy as np
from scipy.stats import chi2_contingency
from itertools import product
import itertools as it

from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

from datetime import datetime
import tempfile
import os
import gzip 
import subprocess


### Sim pipeline


In [2]:
from tools.SLiM_pipe_tools import (
    read_chrom_sizes, region_sample, region_samplev1,
    fasta_Rextract, write_fastaEx, process_recipe,
    SLiM_dispenser, 
)

In [3]:
## Root locations. These are absolute paths on the machine the notebook
## was run on; edit them before running elsewhere.
main_dir= '/mnt/c/Users/floyd/Desktop/AZ/SuzLab/SLiM/'
slim_dir= '/mnt/c/Users/floyd/Desktop/AZ/SuzLab/SLiM/'
fastas_dir= '/mnt/c/Users/floyd/Desktop/AZ/SuzLab/SLiM/Fastas/'

## SLiM recipe: its folder under main_dir, then the full path to the file.
sim_dir= '{}Recipes/Human_sims/'.format(main_dir)
sim_recipe= '{}Gravel_2011_kmmVCF_fasta_pipe.slim'.format(sim_dir)

## Path pattern built from slim_dir (ends in the wildcard 'sim*').
slim_soft= '{}sim*'.format(slim_dir)

## mutation_counter tree: launch folder, data / vcf folders, count folder.
dir_launch= '{}mutation_counter'.format(main_dir)
dir_data= '{}/data/'.format(dir_launch)
dir_vcf= '{}vcf_data/'.format(dir_data)
count_dir= '{}/count/'.format(dir_launch)

## Name of the simulation summary log.
summary_file= 'sims.log'

In [4]:
## Labels: batch name used to tag the simulations, and the assembly to sample from.
batch_name, assembly= 'Gravel', 'hg38'

## Sampling parameters:
##   L - length of each fasta segment;
##   N - number of segments / sims.
L= 10 ** 4
N= 2


In [None]:


def region_samplev2(L, chrom_sizes, N, fasta_file= ''):
    '''
    Sample N random windows of length L and extract them from a fasta file.

    Builds the dictionary {chrom: array(start positions)} and hands it to
    fasta_Rextract, whose result is returned unchanged.

    Parameters
    ----------
    L: window length.
    chrom_sizes: dictionary {chrom: chromosome length}.
    N: number of windows to draw. Chromosomes are drawn with replacement,
        so one chromosome can receive several windows.
    fasta_file: path forwarded to fasta_Rextract.

    Raises
    ------
    ValueError: if no chromosome in chrom_sizes is at least L long.
    '''
    # Only chromosomes that can hold a full window are candidates; sampling
    # a shorter one would make the randint call below fail (low >= high).
    eligible= [z for z in chrom_sizes.keys() if chrom_sizes[z] >= L]
    if not eligible:
        raise ValueError(
            'no chromosome is long enough for a window of length {}'.format(L)
        )

    picks= np.random.choice(eligible, N, replace= True)

    # Number of windows requested per sampled chromosome.
    names, counts= np.unique(picks, return_counts= True)

    # randint excludes `high`, hence the + 1: the last valid start is
    # size - L, whose window ends exactly at the chromosome end.
    seqs_store= {
        z: np.random.randint(low= 0, high= chrom_sizes[z] - L + 1, size= n)
        for z, n in zip(names.tolist(), counts.tolist())
    }

    seqs= fasta_Rextract(fasta_file, seqs_store, L= L)

    return seqs



def fasta_Rextract(fasta,seq_store,L= 10000):
    '''
    Extract fixed-length regions from a gzip-compressed fasta file.

    Parameters
    ----------
    fasta: path to the gzipped fasta file.
    seq_store: dictionary {chrom: iterable of start positions}. A record is
        matched on the first whitespace-delimited word of its header line,
        after removing a leading 'chr' prefix.
    L: length of each extracted region.

    Returns
    -------
    dictionary {chrom: {start: sequence[start:start + L]}}; sequences are
    upper-cased. A chromosome that never appears in the fasta maps each of
    its starts to an empty string.
    '''
    refseqs= {chrom: {} for chrom in seq_store.keys()}
    spools= {}

    try:
        # One scratch file per requested chromosome; its sequence lines are
        # concatenated there while the fasta is streamed.
        for chrom in seq_store.keys():
            spools[chrom]= tempfile.TemporaryFile()

        # Scratch file of the record being read; None means "skip record".
        current= None

        with gzip.open(fasta,'rb') as fp:
            for line in fp:
                if line[0:1] == b'>':
                    fields= line.decode().strip()[1:].split(None, 1)
                    name= fields[0] if fields else ''
                    # Drop the prefix only. str.strip('chr') would remove any
                    # leading/trailing 'c', 'h' or 'r' characters instead.
                    if name.startswith('chr'):
                        name= name[3:]

                    if name in spools:
                        print('opening fasta chr: {}'.format(name))
                        current= spools[name]
                    else:
                        current= None
                    continue

                if current is not None:
                    # strip() also removes '\r', so CRLF files do not shift
                    # the coordinates of the stored sequence.
                    current.write(line.upper().strip())

        for chrom in seq_store.keys():
            spool= spools[chrom]
            spool.seek(0)
            sequence= spool.read().decode()
            spool.close()

            for s in seq_store[chrom]:
                refseqs[chrom][s]= sequence[s:(s+L)]
    finally:
        # Release every scratch file, including after an error; closing an
        # already closed file is a no-op.
        for spool in spools.values():
            spool.close()

    return refseqs




In [6]:

## Path of the gzipped fasta for the selected assembly.
fasta= '{}{}.fa.gz'.format(fastas_dir, assembly)

## Chromosome sizes for the assembly, read to decide where to sample from.
chrom_sizes= read_chrom_sizes(assembly)

## Sample the fasta (L and N are set in the configuration cell).
rseqs= region_samplev1(L, chrom_sizes, N, fasta)



opening fasta chr: 14
opening fasta chr: 21
launch SLiM jobs.


In [None]:
## Perform Simulations

## Log file name passed to SLiM_dispenser as `mutlog`. It is assigned here so
## this cell runs on a fresh kernel; the same value is assigned again in the
## mutation-counter cell further down.
mutlog= 'toMut.log'

print('launch SLiM jobs.')
SLiM_dispenser(rseqs,sim_recipe,dir_data= dir_data, dir_vcf= dir_vcf, 
               slim_dir= slim_dir, logSims= summary_file,
               mutlog= mutlog,batch_name= batch_name)


In [29]:
from tools.SLiM_pipe_tools import mutation_counter_launch

## Log file name passed to mutation_counter_launch.
mutlog= 'toMut.log'

## Count mutations for the simulations.
print('launch mutation counter.')
mutation_counter_launch(
    mutlog,
    count_dir= count_dir,
    dir_launch= dir_launch,
    main_dir= main_dir,
)


launch mutation counter.
['GravelC14.44356243']
