In [1]:
import sys
import argparse
import numpy as np
from scipy.stats import chi2_contingency
from itertools import product
import itertools as it

from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

from datetime import datetime
import tempfile
import os
import gzip 
import subprocess


### Sim pipeline


In [2]:
from tools.SLiM_pipe_tools import (
    read_chrom_sizes, region_sample, region_samplev1,
    fasta_Rextract, write_fastaEx, process_recipe,
    SLiM_dispenser, 
)

In [3]:
print(os.getcwd())

/mnt/d/GitHub/fine-scale-mutation-spectrum-master/slim_pipe


In [4]:
## directories
main_dir= '/mnt/d/GitHub/fine-scale-mutation-spectrum-master/slim_pipe/'
slim_dir= '/mnt/d/GitHub/fine-scale-mutation-spectrum-master/slim_pipe/'
fastas_dir= '/mnt/d/GitHub/fine-scale-mutation-spectrum-master/slim_pipe/Fastas/'
##

## sub-directories.
sim_dir= main_dir + 'Recipes/Human_sims/'
sim_recipe= 'Gravel_2011_kmmVCF_fasta_pipe.slim'

dir_data= main_dir + 'mutation_counter/data/'
dir_vcf= dir_data + 'vcf_data/'
count_dir= main_dir + 'mutation_counter/count/'
dir_launch= main_dir + 'mutation_counter'

summary_file= 'sims.log'

#
##
##
slim_soft= slim_dir + 'sim*'
sim_recipe= sim_dir + sim_recipe
##
##
#

In [5]:
## Simulation tag names, assembly to select from.
batch_name= 'Gravel'
assembly= 'hg38'

## files & variables
## fasta segment lengths; number of segments / sims.
L= int(1e4)
N= 2


In [45]:


def region_samplev2(L, chrom_sizes, N, fasta_file= ''):
    '''
    prepare sequence dictionary: {chrom:list(start pos)}
    provide to fasta_Rextract.
    
    ##
    '''
    
    chroms= list(chrom_sizes.keys())
    sizes= [int(chrom_sizes[x]) for x in chroms]
    sizes= np.array(sizes) / sum(sizes)
    
    choices= np.random.choice(chroms,size= N,p= sizes,replace= True)
    print(choices)
    
    seqs_store= {
        z: len([x for x in range(N) if choices[x]==z]) for z in list(set(choices))
    }
    
    seqs= fasta_RextractUnif(fasta_file, seqs_store, L= L)
    
    return seqs



def fasta_RextractUnif(fasta,seq_store,L= 10000):
    ''' 
    Extract regions from fasta file.
    Takes dictionary {chrom: list(start points)};
    extracts sequences after reading each chromosome.
    '''
    refseqs= {x:{} for x in seq_store.keys()}
    
    Outfiles= {
        x: tempfile.TemporaryFile() for x in seq_store.keys()
    }
    
    d= 0
     
    with gzip.open(fasta,'rb') as fp:
        for line in fp:
            #line= line.decode()
            if line[0:1] == b'>':
                head= line.decode()
                head= head.strip()
                if d != 0:
                    d=0

                head= head[1:].split('\t')
                head= head[0].strip('chr')

                if head in seq_store.keys():
                    print('opening fasta chr: {}'.format(head))
                    d= head
                    outfile= Outfiles[d]
                    continue
            
            if d != 0:
                processed_line= line.upper().strip(b'\n')
                outfile.write(processed_line)
    
    for chrom in Outfiles.keys():
        f= Outfiles[chrom]
        f.seek(os.SEEK_SET)
        result= f.read().decode()
        f.close()
        
        chrom_seqs= return_seqs(result,size= seq_store[chrom],L= L)
        
        refseqs[chrom]= chrom_seqs
    
    return refseqs



def return_seqs(seq,size= 10,L= 1000,keep= ['A','T','G','C']):
    
    d= 0
    
    seqL= len(seq)
    seq_dict= {}
    
    while d <= size:
        pos= np.random.randint(low= 0, high= seqL - L,size= 1)[0]
        given= seq[pos:(pos + L)]
        
        scrag= [x for x in given if x not in keep]
        
        if not scrag:
            seq_dict[pos]= given
            
            d += 1
    
    return seq_dict


In [46]:
import numpy as np

## Read chrom_sizes file to decide where to sample files from. 
chrom_sizes= read_chrom_sizes(assembly)

## Sample fasta.
##
fasta= fastas_dir + assembly + '.fa.gz'
rseqs= region_samplev2(L, chrom_sizes,N, fasta)


['7' '22']
opening fasta chr: 22
opening fasta chr: 7
AACCATTATA
AAAAAATTAG
AAATGTCAAC
ATTTAATACA


In [56]:

def cook_constants_v1(fasta_dict, dir_data= "./data/", 
            dir_vcf= "vcf_data/", slim_dir= './', batch_name= ''):
    '''
    set up conditions.
    constants:
        - vcf_file;
        - fasta_file - writes fasta; mkdir fasta_dir
    '''
    sim_store= {}
    
    for chrom in fasta_dict.keys():
        for start in fasta_dict[chrom].keys():
            fasta= fasta_dict[chrom][start]

            ### set up names and directories.
            SIMname= batch_name + 'C{}.{}'.format(chrom,str(start))
            vcf_file= dir_vcf + SIMname + "_chr{}.vcf".format(chrom)
            ref_dir= dir_data + SIMname + '_reference'
            os.makedirs(ref_dir, exist_ok=True)
            
            ### write fasta file for SLiM.
            fasta_file= write_fastaEx(fasta,chrom=chrom,start= start,
                          ID= SIMname,fasta_dir= ref_dir)
            
            sim_store[SIMname]= {
                "vcf_file": vcf_file,
                "fasta_file": fasta_file
            }
    
    return sim_store


def cook_constants_v2(fasta_dict, dir_data= "./data/", 
            dir_vcf= "vcf_data/", slim_dir= './', batch_name= ''):
    '''
    set up conditions.
    constants:
        - vcf_file;
        - fasta_file - writes fasta; mkdir fasta_dir
    '''
    sim_store= {}
    
    for chrom in fasta_dict.keys():
        for start in fasta_dict[chrom].keys():
            fasta= fasta_dict[chrom][start]

            ### set up names and directories.
            SIMname= batch_name + 'C{}.{}'.format(chrom,str(start))
            vcf_file= dir_vcf + SIMname + "_chr{}.vcf".format(chrom)
            ref_dir= dir_data + SIMname + '_reference'
            os.makedirs(ref_dir, exist_ok=True)
            
            ### write fasta file for SLiM.
            fasta_file= write_fastaEx(fasta,chrom=chrom,start= start,
                          ID= SIMname,fasta_dir= ref_dir)
            
            sim_store[SIMname]= {
                "vcf_file": vcf_file,
                "fasta_file": fasta_file
            }
    
    return sim_store


def SLiM_dispenserv1(sim_store, slim_dir= './', batch_name= '',
                    logSims= 'sims.log', mutlog= 'toMut.log'):
    ''' execute SLiM program
    - simulation specific recipe:
    - recipe template is re-written to direct to new fasta.
    '''
    nf= len(sim_store)
    for SIMname in sim_store.keys():
        
        command_line_constants= sim_store[SIMname]

        ### generate modified slim recipe
        new_recipe= process_recipe(sim_recipe,command_line_constants, SIMname)

        seed= np.random.randint(0,high= nf,size= 1)[0]
        ### Launch SLiM through shell.
        slim_soft= slim_dir + 'slim*' 

        command_units= [slim_soft, '-m', '-s', str(seed), new_recipe]

        os.system(' '.join(command_units))

        os.system('gzip {}'.format(sim_store[SIMname]["vcf_file"]))
        os.system('gzip {}'.format(sim_store[SIMname]["fasta_file"]))
        os.system('rm {}'.format(new_recipe))

        now = datetime.now()
        tnow= now.strftime("%d/%m/%Y %H:%M:%S")

        with open(logSims,'a') as fp:
            fp.write(SIMname + '\t' + tnow + '\n')
        with open(mutlog,'a') as fp:
            fp.write(SIMname + '\n')




In [57]:
mutlog= 'toMut.log'
## Perform Simulations
print('launch SLiM jobs.')

sim_store= cook_constants_v1(rseqs,dir_data= dir_data, dir_vcf= dir_vcf, 
               slim_dir= slim_dir, batch_name= batch_name)

SLiM_dispenserv1(sim_store, slim_dir= slim_dir, batch_name= batch_name,
                    logSims= summary_file, mutlog= mutlog)

In [None]:

SLiM_dispenser(rseqs,sim_recipe,dir_data= dir_data, dir_vcf= dir_vcf, 
               slim_dir= slim_dir, logSims= summary_file,
               mutlog= mutlog,batch_name= batch_name)


In [58]:
from tools.SLiM_pipe_tools import mutation_counter_launch

mutlog= 'toMut.log'

print('launch mutation counter.')
mutation_counter_launch(mutlog,count_dir= count_dir, 
                        dir_launch= dir_launch,main_dir= main_dir)


launch mutation counter.
['GravelC7.145319599\n', 'GravelC7.10890494\n', 'GravelC22.40861331\n', 'GravelC22.46101549\n']
