In [1]:
import sys
import argparse
import numpy as np
from scipy.stats import chi2_contingency
from itertools import product
import itertools as it

import allel
import pandas as pd

from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)
    
from datetime import datetime
import tempfile
import os
import gzip
import subprocess
import time

import collections
def recursively_default_dict():
    '''
    Build an auto-nesting dictionary.

    Returns a collections.defaultdict whose default factory is this same
    function, so looking up a missing key creates another nested defaultdict
    of the same kind, to arbitrary depth.
    '''
    nested= collections.defaultdict(recursively_default_dict)
    return nested

In [2]:
from tools.mcounter_tools import (
    read_vcf_allel, ind_assignment_scatter_v1, MC_sample_matrix_v1,
    heatmap_v2, ind_assignment_SFS, read_windows_SFS
)

In [3]:
#from tools.SLiM_pipe_tools import mutation_counter_launch
import re
import pandas as pd


## directories
# Paths are built by plain string concatenation and keep their trailing '/'.
main_dir= os.getcwd() + '/'
sims_dir= main_dir + 'mutation_counter/data/sims_burnin_v2/'

## options forwarded to read_windows_SFS
diffs= False
frequency_range= [0,1]
# NOTE(review): presumably asks the reader to keep each simulation's arguments
# (later cells read data[sim]['args']['evt']) -- confirm against read_windows_SFS.
args= True

## load every simulated window (the recorded run of this cell took ~10 minutes)
data_kmer, data= read_windows_SFS(
    diffs= diffs,
    frequency_range= frequency_range,
    indfile= 'ind_assignments.txt',
    outemp= 'ind_assignments{}.txt',
    sim_dir= sims_dir,
    muted_dir= 'mutation_counter/data/mutation_count/',
    outlog= 'indy.log',
    row= 24,
    col= 4,
    single= True,
    exclude= False,
    args= args
)

missing: 0, no vcf: 65
385
time elapsed: 604.2579879760742s


### I. Sample SFS

We plot the SFS of a sample of all available windows (see the `Nplot` argument below). This is for visualization purposes only; tests of convergence to expectation are performed in section II.

In [4]:
# Order the available simulations by their 'evt' argument (burn-in time).
available= list(data.keys())
times_dict= {
    sim_t: int(data[sim_t]['args']['evt']) for sim_t in available
}

sorted_avail= sorted(times_dict,key= times_dict.get,reverse= False)

# Evenly spaced subsample of the time-ordered simulations. np.unique drops the
# repeated indices that the integer linspace produces when fewer than Nplot
# simulations are available, so no simulation is plotted twice.
Nplot= 10
plot_whom= np.unique(np.linspace(0,len(available)-1,Nplot,dtype=int))

# The histogram uses one bin per row of the genotype matrix.
N_inds= data[available[0]]['geno'].shape[0]
n_bins= np.linspace(0,1.01,N_inds)  # not used in this cell
pop= 'pop0'

fig= []

for sim_idx in plot_whom:
    sim_t= sorted_avail[sim_idx]
    gen_time= data[sim_t]['args']['evt']
    freqs= data[sim_t]['freqs'][pop]
    N= len(freqs)

    # np.histogram returns (counts, bin edges): counts are normalised to
    # proportions and the edges are replaced by the bin mid-points.
    bin_count,bin_middle= np.histogram(freqs,bins= N_inds,range= [0,1])
    bin_count= bin_count / sum(bin_count)
    bin_middle= [(bin_middle[x] + bin_middle[x-1]) / 2 for x in range(1,len(bin_middle))]

    # NOTE(review): the legend shows evt + 1000; presumably a fixed offset
    # between 'evt' and the total burn-in length -- confirm.
    fig_sim= go.Scatter(
        x= bin_middle,
        y= bin_count,
        name= str(int(gen_time) + 1000)
    )

    fig.append(fig_sim)

layout= go.Layout(
    xaxis= dict(
        range= [-0.1,1.1],
        title= 'frequency'
    ),
    yaxis= dict(
        # The plotted values are normalised bin counts, not raw counts.
        title= 'proportion of SNPs'
    ),
    title= 'SFS by burnin'
)
Figure= go.Figure(data=fig,layout=layout)

iplot(Figure)

**Fig. 1.** Linear scatter plot of SFS sample. 

In [50]:
# Order the available simulations by their 'evt' argument (burn-in time).
available= list(data.keys())
times_dict= {
    sim_t: int(data[sim_t]['args']['evt']) for sim_t in available
}

sorted_avail= sorted(times_dict,key= times_dict.get,reverse= False)

# Evenly spaced subsample of the time-ordered simulations. np.unique drops the
# repeated indices that the integer linspace produces when fewer than Nplot
# simulations are available, so no simulation is plotted twice.
Nplot= 10
plot_whom= np.unique(np.linspace(0,len(available)-1,Nplot,dtype=int))

# N_inds is defined by the Fig. 1 cell, which must have been run first.
Nbin= N_inds
n_bins= np.linspace(0,1.01,Nbin)  # not used in this cell
pop= 'pop0'

fig= go.Figure()

for sim_idx in plot_whom:
    sim_t= sorted_avail[sim_idx]
    gen_time= data[sim_t]['args']['evt']
    freqs= data[sim_t]['freqs'][pop]
    N= len(freqs)

    fig.add_trace(go.Histogram(
        x=freqs,
        histnorm='percent',
        name=str(int(gen_time) + 1000), # name used in legend and hover labels
        xbins=dict( # bins used for histogram
            start=0,
            end=1.01,
            size= 1 / N_inds
        ),
        #marker_color='#EB89B5',
        opacity=0.75
    ))

##### Expected site frequency spectrum
#####
mu= 1e-8
Ne= 20000
Theta= 4 * Ne * mu
Nsamp= 1092

# Weight of frequency class x is proportional to Theta / x; the vector is then
# normalised to sum to 1 so it can serve as sampling probabilities.
freq_exp= [Theta / x for x in range(1,Nsamp+1)]
freq_exp= np.array(freq_exp) / np.sum(freq_exp)
# NOTE(review): with x running over 1..Nsamp and a 2*Nsamp denominator these
# frequencies stop at 0.5 -- confirm that is the intended range.
freqs_possible= [x / (2*Nsamp) for x in range(1,Nsamp+1)]

# A local, seeded generator keeps the 'expected' trace identical across
# re-runs without touching NumPy's global random state.
EXPECTED_SFS_SEED= 42
expected_rng= np.random.RandomState(EXPECTED_SFS_SEED)

# As many draws as there are SNPs in the last simulation plotted above.
freq_sample= expected_rng.choice(freqs_possible,len(freqs),p=freq_exp)

fig.add_trace(go.Histogram(
    x=freq_sample,
    histnorm='percent',
    name='expected', # name used in legend and hover labels
    xbins=dict( # bins used for histogram
        start=0,
        end=1.01,
        size= 1 / N_inds
    ),
    #marker_color='#EB89B5',
    opacity=0.75
))

#####

layout= go.Layout(
    xaxis= dict(
        range= [-0.1,1.1],
        title= 'frequency'
    ),
    yaxis= dict(
        # histnorm='percent' puts percentages, not raw counts, on this axis.
        title= 'percent of SNPs'
    ),
    title= 'SFS by burnin'
)

fig['layout']= layout

#fig.to_image(format="png", width=2000, height=1500, scale=2)
iplot(fig)

**Fig. 2.** SFS histogram of the window sample. The expected SFS under neutrality, drawn for an arbitrary number of SNPs, is also plotted. Because it is obtained by sampling, some deviation is expected; see the tests of convergence in section II.

## II. Estimating convergence of SFS

> i. Among simulated SFS vectors. 

In [39]:

# One normalised SFS vector per simulation, stored in burn-in order
# (the order of sorted_avail).
counts= []

for sim_idx, sim_t in enumerate(sorted_avail):
    gen_time= data[sim_t]['args']['evt']
    # `freqs` stays defined after the loop; the sampling cell further down
    # reads len(freqs).
    freqs= data[sim_t]['freqs'][pop]
    N= len(freqs)

    # Histogram over [0, 1] with N_inds bins, scaled so the bins sum to 1.
    bin_count, bin_middle= np.histogram(freqs, bins= N_inds, range= [0,1])
    bin_count= bin_count / sum(bin_count)
    # Replace the bin edges with the mid-point of every bin.
    bin_middle= [
        (left + right) / 2
        for left, right in zip(bin_middle[:-1], bin_middle[1:])
    ]

    bin_count= np.array(bin_count)
    counts.append(bin_count)



In [70]:
def set_SSD(set1,set2):
    '''
    Sum of squared differences between every pair of vectors across two sets.

    For each vector of set1 (outer loop) and each vector of set2 (inner loop)
    the element-wise difference is squared and summed. The result is a flat
    list of len(set1) * len(set2) values, grouped by set1 vector.
    '''
    dists= []

    for ref_vec in set1:
        dists.extend(np.sum((other_vec - ref_vec) ** 2) for other_vec in set2)

    return dists


# Burn-in time of every simulation, in the same order as `counts`.
burn_times= list(map(times_dict.get, sorted_avail))

# Group the SFS vectors by burn-in time.
burn_dict= {
    z: [counts[x] for x, burn_t in enumerate(burn_times) if burn_t == z]
    for z in set(burn_times)
}

burn_clades= sorted(burn_dict)

# SSDs between the SFS vectors of each burn-in time and those of the
# preceding one. NOTE: `diffs` was a boolean option in the loading cell and is
# rebound to a list here.
diffs= []

for idx, (ref, burn) in enumerate(zip(burn_clades, burn_clades[1:]), start= 1):
    dists= set_SSD(burn_dict[burn], burn_dict[ref])
    diffs.append(dists)


In [41]:
# Mean and standard deviation of the SSD values of each comparison.
mean_diffs= [np.mean(ssd_set) for ssd_set in diffs]
std_diffs= [np.std(ssd_set) for ssd_set in diffs]
# The first burn-in time has no predecessor, so the x axis starts at the second.
# NOTE(review): unlike the last plot of this notebook, no +1000 offset is
# applied to these times -- confirm which convention is intended.
surface= burn_clades[1:]

fig= [
    go.Scatter(
        x= surface,
        y= mean_diffs,
        error_y= {
            'type': 'data',
            'array': std_diffs,
            'visible': True
        }
    )
]

layout= go.Layout(
    title= 'SFS distances by burnin time',
    xaxis= {
        'title': 'time before sampling',
        'range': [0, max(burn_times) + 1000]
    },
    yaxis= {
        'title': 'sum of squared diffs'
    }
)
Figure= go.Figure(data= fig, layout= layout)

iplot(Figure)

**Fig. 3.** SFS convergence among simulated data. The sum of squared differences was calculated between windows of adjacent burn-in times; the average and standard deviation are plotted.

> ii. relative to expected SFS.

In [51]:

#### getting a distribution of expected frequency arrays
#### and the corresponding SFSs.

# A local, seeded generator keeps the expected SFS set (and the figure built
# from it) identical across re-runs without touching NumPy's global state.
EXPECTED_SFS_SEED= 42
N_EXPECTED_DRAWS= 10
expected_rng= np.random.RandomState(EXPECTED_SFS_SEED)

# Each expected array gets as many SNPs as the simulation currently held in
# `freqs` (left over from whichever per-simulation loop ran last).
n_snps= len(freqs)
expected_freq_clade= [
    expected_rng.choice(freqs_possible,n_snps,p=freq_exp)
    for _ in range(N_EXPECTED_DRAWS)
]

# Bin every sampled array exactly like the simulated data: N_inds bins over
# [0, 1], normalised to sum to 1. A dedicated loop variable is used so the
# simulation `freqs` is not overwritten with synthetic draws.
expected_freq_counts= []
for freqs_expected in expected_freq_clade:
    bin_count,bin_edges= np.histogram(freqs_expected,bins= N_inds,range= [0,1])
    bin_count= bin_count / bin_count.sum()
    expected_freq_counts.append(bin_count)

### calculating SSDs against set of expected SFSs

# NOTE(review): the loop starts at index 1, so the earliest burn-in time is
# left out; the plotting cell that follows uses burn_clades[1:] to match.
diffs= []
for idx in range(1,len(burn_clades)):
    burn= burn_clades[idx]

    dists= set_SSD(burn_dict[burn],expected_freq_counts)

    diffs.append(dists)


In [52]:
# Mean and standard deviation of the SSD values for each burn-in time.
mean_diffs= [np.mean(ssd_set) for ssd_set in diffs]
std_diffs= [np.std(ssd_set) for ssd_set in diffs]
# `diffs` was filled starting from the second burn-in time, hence the slice.
surface= burn_clades[1:]

error_bars= {
    'type': 'data',
    'array': std_diffs,
    'visible': True
}

fig= [go.Scatter(x= surface, y= mean_diffs, error_y= error_bars)]

layout= go.Layout(
    title= 'SFS distances by burnin time',
    xaxis= {
        'title': 'time before sampling',
        'range': [0, max(burn_times) + 1000]
    },
    yaxis= {
        'title': 'sum of squared diffs'
    }
)
Figure= go.Figure(data= fig, layout= layout)

iplot(Figure)

**Fig. 4.** SFS convergence to expectation. Sum of squared differences was calculated against set of SFS vectors drawn using expected frequencies under neutrality. 

> iii. Using calculated expected frequencies instead of samples from the expected distribution.

In [76]:

#### Deterministic expectation: compare against the normalised expected
#### frequency vector itself instead of SFSs built from random draws.

# NOTE(review): freq_exp holds one entry per expected frequency class (Nsamp
# of them), while the simulated SFS vectors in burn_dict have N_inds histogram
# bins over [0, 1]. The subtraction in set_SSD only lines up bin-for-bin if
# the two coincide -- confirm.
expected_freq_counts= [freq_exp]

### calculating SSDs against the expected SFS

# Unlike the sampled-expectation cell, every burn-in time is included here.
diffs= []
for idx in range(len(burn_clades)):
    burn= burn_clades[idx]

    dists= set_SSD(burn_dict[burn],expected_freq_counts)
    diffs.append(dists)


In [86]:
# Mean and standard deviation of the SSD values for each burn-in time.
mean_diffs= [np.mean(ssd_set) for ssd_set in diffs]
std_diffs= [np.std(ssd_set) for ssd_set in diffs]
# All burn-in times are plotted, shifted by the same +1000 offset used in the
# legends of the SFS figures.
surface= np.array(burn_clades) + 1000

error_bars= {
    'type': 'data',
    'array': std_diffs,
    'visible': True
}

fig= [go.Scatter(x= surface, y= mean_diffs, error_y= error_bars)]

layout= go.Layout(
    title= 'SFS distances by burnin time',
    xaxis= {
        'title': 'time before sampling',
        'range': [0, max(surface) + 1000]
    },
    yaxis= {
        'title': 'sum of squared diffs'
    }
)
Figure= go.Figure(data= fig, layout= layout)

iplot(Figure)