In [1]:
import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import MeanShift, estimate_bandwidth

import pandas as pd

from scipy import stats
from scipy.stats import beta
from math import sin
from random import randint

import matplotlib.pyplot as plt
import itertools as it

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

import collections

def recursively_default_dict():
        return collections.defaultdict(recursively_default_dict)



## Coalescence algorithms.

Calculating the probability of sample configuration using population genetics models of mutation. 

    I. Infinite alleles - Ewens, 1972.
        i. recursive
        ii. exact.
       
    II. Infinite sites. 
            Griffiths, Ethier and Tavaré, 1987-1995
            Wu, 2010

- Following Hein, Schierup and Wiuf, 20015. Chapter II. 

### References

- Ewens, W. J. (1972). The sampling theory of selectively neutral alleles. Theoretical population biology, 3(1), 87-112.

- Tavaré, S., Balding, D. J., Griffiths, R. C., & Donnelly, P. (1997). Inferring coalescence times from DNA sequence data. Genetics, 145(2), 505-518.

- Hein, J., Schierup, M., & Wiuf, C. (2004). Gene genealogies, variation and evolution: a primer in coalescent theory. Oxford University Press, USA.

- Wu, Y. (2010). Exact computation of coalescent likelihood for panmictic and subdivided populations under the infinite sites model. IEEE/ACM transactions on computational biology and bioinformatics, 7(4), 611-618.

## Presentation

### co-factors

#### mutation & coalescence

Probability that a either mutation or a coalescent event occurs first when considered backwards in generations. Each modelled after an exponential distribution of intensity:

   - a binomial coefficient, the number of combinations of 2 genes among _k_ possible genes, for coalescence;
    
   - _Theta_, or **4Nu**, the scaled mutation rate, for mutation.


The formulas are derived as the calculation of the minimum of these two quantities, an exponential of intensity equal to the sum of that of its components:

   - _min {Exp(a), Exp(b)} = Exp(a + b)_


In [2]:
def prob_coal(theta,nsamp):
    
    p= (nsamp - 1) / (nsamp - 1 + theta)
    
    return p

def prob_mut(theta,nsamp):
    
    p= theta / (nsamp - 1 + theta)
    
    return p


#### config.

Sample configuration. 

Under the infinite alleles model (Ewens 1972), it is assumed that no spatial or quantitaive information is known about alleles. We know only whether two alleles are different or identical. As a consequence, haplotype data sets can be treated as vectors of length _j_, where each element represents the number of allele classes possessing _j_ members.

As you can imagine, this makes considering the disapearance of alleles, through coalescence of identical forms or disapearnce of singletons through mutation, rather more simple. 

_Yet not so simple that to reproduce the algorithm is straighforward (see below)._

The function `get_config` draws the sample configuration of a numpy array. It considers rows as haplotypes, columns as loci.

In [3]:

def get_config(dataw,nsamp):
    hap_str= [''.join([str(c) for c in x]).strip() for x in dataw]
    hap_str= {z:[x for x in range(nsamp) if hap_str[x] == z] for z in set(hap_str)}
    
    class_len= np.array([len(hap_str[z]) for z in hap_str.keys()])
    
    config= [sum(class_len == x) for x in range(1,nsamp + 1)]
    return config


## Infinite alleles

_Ewens 1972_

    i. Recursion.

Recursion equation of probability of sample configuration as described by Ewens. Equation `2.13` of GGVE (see Index).


In [4]:

def Ewens_recurs(config_vec,theta,prob_array,Pin,prob_bound = 1):
    n_samp= sum([(x + 1) * config_vec[x] for x in range(len(config_vec))])
    
    if config_vec == [1]:
        ## boundary
        
        prob_left= Pin * prob_bound
        
        prob_array.append(prob_left)
        
        return prob_array
    
    if config_vec[0] > 0:
        ## mutation
        prob_left= prob_mut(theta,n_samp)
        
        new_conf= list(config_vec)[:(n_samp - 1)]
        new_conf[0]= new_conf[0] - 1
        
        prob_next= Pin * prob_left
        
        Ewens_recurs(new_conf,theta,prob_array,prob_next)
    
    
    if sum(config_vec[1:]) > 0:
        ## coalesc
        prob_right_I = prob_coal(theta,n_samp)
        
        jsel= [x for x in range(1,len(config_vec)) if config_vec[x] > 0]
        
        for sub in jsel:
            ##  coalesce for all classes still holding more than one allele.
            
            jprop= sub * (config_vec[sub - 1] + 1) / (n_samp - 1)
            
            new_conf= list(config_vec)
            new_conf[sub] -= 1
            new_conf[sub - 1] += 1
            new_conf= new_conf[:(n_samp - 1)]
            
            prob_right= prob_right_I * jprop

            prob_next= Pin * prob_right

            Ewens_recurs(new_conf,theta,prob_array,prob_next)
    
    return prob_array


    ii. Exact formula.

Ewens sampling formula, the exact solution to recursion above. Equation `2.19` of GGVE (see Index).

**verified** the output of this implementation was verified against table 2.1 (p. 48).

In [5]:
import math

def Ewens_exact(config_data,theta):
    
    n_samp= sum([(x + 1) * config_data[x] for x in range(len(config_data))])
    
    ThetaN= [theta + x for x in range(len(config_data))]
    ThetaN0= 1
    for y in ThetaN:
        ThetaN0 = ThetaN0 * y
    
    factor_front= math.factorial(len(config_data)) / ThetaN0
    
    classes= 1
    
    for j in range(len(config_data)):
        comb= theta / (j+1)
        comb= comb**config_data[j]
        
        prod= comb * (1 / math.factorial(config_data[j]))
        
        classes= classes * prod
    
    return factor_front * classes

####
config_trial = [2,0,0,0,0,1,0,0]

theta= 1

Ewens_exact(config_trial,theta)

0.08333333333333333

In [6]:
from structure_tools.Coalesce_plots import plot_Ewens

from plotly import tools

range_theta= np.linspace(.1,5,50)

config_complex= [
    [2,0,0,0,0,1,0,0],
    [4,2,0,0,0,0,0,0],
    [0,0,1,0,1,0,0,0],
    [1,1,0]
]

plot_Ewens(config_complex,range_theta)

['AC: 20000100', 'AC: 42000000', 'AC: 00101000', 'AC: 110']
This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3 ]  [ (2,2) x4,y4 ]



### An attempt at an Infinite-sites algorithm.

#### co-factor.

Update on the `get_config` function to extract as well the set of observations in the numpy array (read haplotypes).


In [7]:

def get_config(dataw,nsamp):
    hap_str= [''.join([str(c) for c in x]).strip() for x in dataw]
    hap_str= {z:[x for x in range(nsamp) if hap_str[x] == z] for z in set(hap_str)}
    
    class_len= np.array([len(hap_str[z]) for z in hap_str.keys()])
    
    config= [sum(class_len == x) for x in range(1,nsamp + 1)]
    return config, hap_str



#### Application

The infinite sites model is difficult to use to estimate the probability of the data. This is because the number of possible states that can give rise to a known configuration rises very quickly as we travel back in generations. This number is increased when taking into account mutation position and sequence label. 

From GGVE:
    
   _If the model had been a sequence 1000 bp long with four nucleotides, the full history would have had 4^1000 ≈ 10^600 states_

Since the limitation is in computation time and memory, there have been attempts to accelerate / lighten this task through clever use of data structures and algorithms. My own attempt here is somewhat inspired by the dynamic algorithm proposed by Wu (2010).

My idea rests on a partition of previous generations / steps into layers. A haplotype data set is stored at each layer and ancestral states are stored only as the index and number of the haplotypes they represent. The `hap` set is updated with every new haplotype produced by the removal of a mutation. A dictionary of pointers is created to store information on how to travel along the ancestry tree created. 


In [20]:
import time
from sklearn.metrics import pairwise_distances


def Inf_sites(Dict_mat,point_up,point_dn,layer_range= 10, sub_sample= 0, poppit= False):
    
    t1 = time.time()
   
    MRCA= False
    
    layer= 0
    
    
    for layer in range(layer_range):
        
        if MRCA:
            continue
            
        print('layer: {}; len: {}'.format(layer,len(Dict_mat[layer])-1))
        
        if len(Dict_mat[layer]) == 2:
            stdlone= max(Dict_mat[layer].keys())
            if sum(Dict_mat[layer][stdlone][:,1]) == 1:
                MRCA = True
                continue

        if poppit:
            if layer > 1:
                Dict_mat.pop(layer - 1)
            
        hap= list(Dict_mat[layer][-2])
        hap_coord= {}
        
        point_up[layer]= []
        
        Dict_mat[layer + 1]= {   
        }
        point_dn[layer + 1]= []
        
        Quasi= []
        nodes= []
        new_haps= []
        
        keys_get= list(Dict_mat[layer].keys())
        
        if sub_sample:
            keys_get= np.random.choice(keys_get,sub_sample)
        
        for desc in keys_get:
            
            if desc >= 0:
                
                packet= list(Dict_mat[layer][desc])
                packet= np.array(packet)

                pack_above= [x for x in range(packet.shape[0]) if packet[x,1] > 1]
                pack_below= [x for x in range(packet.shape[0]) if packet[x,1] == 1]
                
                new_entries= np.array(list(range(len(pack_above)))) + len(Dict_mat[layer + 1])
                
                who_loses= []
                
                ### Coalescence
                for z in range(len(pack_above)):
                    
                    who_loses.append(packet[pack_above[z],0])
                    
                    pack_synth= list(packet)
                    pack_synth= np.array(pack_synth)

                    pack_synth[pack_above[z],1] -= 1
                    
                    pack_tuple= sorted([tuple(x) for x in pack_synth])
                    
                    Query= [pack_tuple == x for x in Quasi]
                    Query= np.array(Query,dtype= int)
                    Query= np.where(Query == 1)[0] ## check if this changes anything
                    
                    if len(Query):
                        new_entries[z] = nodes[Query[0]]
                        
                    else:
                        pack_synth= np.array([list(x) for x in pack_tuple])
                        
                        pack_synth= pack_synth[pack_synth[:,1] > 0]
                        Dict_mat[layer + 1][new_entries[z]]= pack_synth
                        Quasi.append(pack_tuple)
                        nodes.append(new_entries[z])
                                
                packet_mob= packet[pack_above,:]
                
                packet_mob[:,1] = 1
                
                packet_mob= np.hstack((np.zeros((packet_mob.shape[0], 1), dtype=packet_mob.dtype),packet_mob))
                packet_mob= np.hstack((packet_mob,np.zeros((packet_mob.shape[0], 1), dtype=packet_mob.dtype)))
                packet_mob[:,3] = -1 #######
                packet_mob[:,0]= new_entries
                packet_mob= np.hstack((np.zeros((packet_mob.shape[0], 1), dtype=packet_mob.dtype),packet_mob))
                packet_mob[:,0]= desc
                
                for y in packet_mob:
                    point_up[layer].append(y)
                
                pocket_mob= packet[pack_above,:]
                pocket_mob= np.hstack((np.zeros((pocket_mob.shape[0], 1), dtype=pocket_mob.dtype),pocket_mob))
                pocket_mob= np.hstack((pocket_mob,np.zeros((pocket_mob.shape[0], 1), dtype=pocket_mob.dtype)))
                pocket_mob[:,3] = -1
                pocket_mob[:,2] = 1
                pocket_mob[:,0] = desc
                point_dn[layer + 1].extend(pocket_mob)
                
                ## muts that can be removed. Assume mutations happen only once.
                exist= np.array(packet)[:,0]
                exist= np.array(hap)[exist,:]
                single= np.sum(exist,axis= 0)
                single= np.where(single==1)[0]
                ##
                    
                for edan in pack_below:
                    #
                    seq= hap[packet[edan,0]]
                    
                    #print(seq)
                    who= np.where(seq == 1)[0]
                    
                    who= [x for x in who if x in single]
                    
                    if len(who) == 0:
                        continue
                    
                    #print('seq: {}, where: {}'.format(seq,who))
                                        
                    for mut in who:
                        
                        tribia= list(seq)
                        tribia= np.array(tribia)
                        tribia[mut]= 0

                        calc= pairwise_distances(np.array(tribia).reshape(1,-1), hap,
                                                        metric='hamming')[0]
                        
                        match= [x for x in range(len(calc)) if calc[x] == 0] 
                        
                        if len(match):
                            #print(match)
                                                        
                            for cl in match:
                                
                                pack_synth= list(Dict_mat[layer][desc])
                                pack_synth= np.array(pack_synth)
                                pack_synth[edan,1] -= 1
                                pack_synth= pack_synth[pack_synth[:,1] > 0]
                                
                                if cl in pack_synth[:,0]:
                                    cl_idx= list(pack_synth[:,0]).index(cl)
                                    pack_synth[cl_idx,1] += 1
                                    
                                else:
                                    new_row= np.array([cl,1])
                                    pack_synth= np.vstack((pack_synth,new_row.reshape(1,-1)))
                                
                                #### make function Query existant
                                new_entry= len(Dict_mat[layer + 1])
                                while new_entry in Dict_mat[layer + 1].keys():
                                    new_entry += 1
                                
                                ###
                                path_find= 0 #########
                                pack_tuple= sorted([tuple(x) for x in pack_synth])

                                Query= [pack_tuple == x for x in Quasi]
                                Query= np.array(Query,dtype= int)
                                Query= np.where(Query == 1)[0] ## check if this changes anything

                                if len(Query):
                                    new_entry= nodes[Query[0]]

                                else:
                                    #print(pack_synth)
                                    pack_synth= np.array([list(x) for x in pack_tuple])
                                    Dict_mat[layer + 1][new_entry]= pack_synth
                                    Quasi.append(pack_tuple)
                                    nodes.append(new_entry)
                                ### 

                                point_up[layer].append([desc,new_entry,cl,path_find,mut]) ############
                                point_dn[layer + 1].append([desc,cl,0,mut]) 
                        
                        else:
                            #
                            if len(new_haps):
                                #
                                calc= pairwise_distances(np.array(tribia).reshape(1,-1), np.array(new_haps),
                                                                                        metric='hamming')[0]
                                
                                match= [x for x in range(len(calc)) if calc[x] == 0]
                                
                                if len(match):
                                    
                                    new_idx= len(hap) + match[0]
                                
                                else:
                                    new_haps.append(tribia)
                                    new_idx= len(hap) + len(new_haps) - 1
                            
                            else:
                                new_haps.append(tribia)
                                new_idx= len(hap)
                            
                            #
                            pack_synth= list(Dict_mat[layer][desc])
                            pack_synth.append([new_idx,1]) # pack_synth.append([len(pack_synth),1])
                            pack_synth= np.array(pack_synth)
                            pack_synth[edan,1] -= 1
                            pack_synth= pack_synth[pack_synth[:,1] > 0]
                            
                            #### make function Query existant
                            new_entry= len(Dict_mat[layer + 1])
                            while new_entry in Dict_mat[layer + 1].keys():
                                new_entry += 1
                            
                            ###
                            path_find= 0 #########
                            pack_tuple= sorted([tuple(x) for x in pack_synth])

                            Query= [pack_tuple == x for x in Quasi]
                            Query= np.array(Query,dtype= int)
                            Query= np.where(Query == 1)[0] ## check if this changes anything

                            if len(Query):
                                new_entry = nodes[Query[0]]

                            else:
                                
                                pack_synth= np.array([list(x) for x in pack_tuple])
                                Dict_mat[layer + 1][new_entry]= pack_synth
                                Quasi.append(pack_tuple)
                                nodes.append(new_entry)

                            ####
                            point_up[layer].append([desc,new_entry,new_idx,path_find,mut])
                            point_dn[layer + 1].append([desc,new_idx,0,mut])
        
        if new_haps:
            
            hap.extend(new_haps)
        
        point_up[layer]= np.array(point_up[layer])
        point_dn[layer+1]= np.array(point_dn[layer+1])
        Dict_mat[layer + 1][-2] = np.array(hap)
        
        layer += 1
    
    t2 = time.time()
    tscale= 's'
    tpass= t2 - t1
    
    if tpass > 600:
        tpass = tpass / 60
        tscale= 'm'
    
    tpass= round(tpass,3)
    
    print('time elapsed: {} {}'.format(tpass,tscale))
    
    return Dict_mat, point_up, point_dn



### testing

Prepare a data set and dictionaries that feed into the algorithm.  Below, the first dataset presented is meant to emulate the Ancestral Configuration `a(1,1,0)` already seen in the infinite alleles section. The second data set was taken from figure 2.10 (GGVE, pp. 52).

In [21]:
###Generate data from config

dataT= [
    [1,1,0,0],
    [0,0,1,0],
    [0,0,1,0]
]

### example from figure 2.10.

dataT= [
    [1,1,0,0],
    [1,1,0,1],
    [0,0,0,0],
    [0,0,1,0],
    [0,0,1,0]
]

dataT= np.array(dataT)

nsamp= dataT.shape[0]

config_dataw, hap_str= get_config(dataT,nsamp)


hap_sol= list(hap_str.keys())
hap_sun= np.array([np.array(list(x),dtype= int) for x in hap_sol])

hap_size= [len(hap_str[x]) for x in hap_sol]
hap_size= {z:[x for x in range(len(hap_size)) if hap_size[x] == z] for z in list(set(hap_size))}



passing= hap_size.keys()
pack= list(it.chain(*[hap_size[x] for x in passing]))
passport= list(it.chain(*[[x]*len(hap_size[x]) for x in passing]))

pack= [[pack[x],passport[x]] for x in range(len(pack))]
pack= sorted(pack)
pack= np.array(pack)

Dict_mat= {0: 
           {
               -2: hap_sun,
               -1: [0] * hap_sun.shape[0],
               0: pack
              }
          }

point_up= recursively_default_dict()

point_dn= recursively_default_dict()


In [22]:
root_lib, point_up, point_dn = Inf_sites(Dict_mat,point_up,point_dn,layer_range= 10,sub_sample= 0,poppit= False)

layer: 0; len: 2
layer: 1; len: 2
layer: 2; len: 3
layer: 3; len: 5
layer: 4; len: 5
layer: 5; len: 5
layer: 6; len: 4
layer: 7; len: 1
layer: 8; len: 1
time elapsed: 0.033 s


Algorithm appears successful. Number of layers and number of ACs per layer correspond to those in `Figure 2.10` (GGVE,p. 52)

_a lesson learned on impossible ancestral states_ 

Accounted for here by eliminating only those mutations carried by a unique singleton. 

#### Traversing the Tree.

The set of AC connections produced by the algorithm above is independent of coalescent and mutation probabilities. Simply, it holds (if it is successful, which cannot be said at the present, but anticipating a breakthrough), all possible states and connections given the observed data. 

In fact, to calculate the probability of this data, we do not need to revisit the ACs created. We will use the library of pointers, which holds the nodes at each layer, the edges that connect them, and a binary marker indicating wether the link represents a mutation or coalescence envent. 



In [25]:
root_lib[7]

{-2: array([[1, 1, 0, 1],
        [0, 0, 0, 0],
        [1, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 1, 0, 0],
        [1, 0, 0, 0]]), 0: array([[1, 2]])}

In [26]:
#### Getting probab

def runUp(Up_lib,layer,start,Theta= 1,probs= [],prob_vec= [],Pin= 1):
    
    if not len(Up_lib[layer]):
        
        prob_vec.append(Pin)
        
        return prob_vec
    
    Ways= Up_lib[layer]
    Ways= Ways[Ways[:,0] == start]
    
    for row in range(Ways.shape[0]):
        action= Ways[row,3]
        
        probe= probs[action]
        next_stop= Ways[row,1]
        new_pin= Pin * probe

        runUp(Up_lib,layer + 1,next_stop,Theta= Theta,probs= probs,prob_vec= prob_vec,Pin= new_pin)
    
    return prob_vec


### wasn't taking into account the probability of each given the population size.

def runUp_balance(Up_lib,Root,layer,start,Theta= 1,probs= [],prob_vec= [],Pin= 1):
    
    if not len(Up_lib[layer]):
        
        prob_vec.append(Pin)
        
        return prob_vec
    
    Ways= Up_lib[layer]
    Ways= Ways[Ways[:,0] == start]
    
    for row in range(Ways.shape[0]):
        action= Ways[row,3]
        
        ## identifying the next node.
        next_stop= Ways[row,1]
        node_next= Root[layer + 1][next_stop]
        
        ## calculate mut. and coal. probs based on next node sample size. 
        nsamp= sum(node_next[:,1])
        
        mut_prob= prob_mut(Theta,nsamp)
        coal_prob= prob_coal(Theta,nsamp)

        ### Mut was coded to 0, coalescence to 1.
        probs= [mut_prob,coal_prob]
        
        probe= probs[action] # edge = [mutation, coalescence] 
        
        ###
        
        who_lost= Ways[row,2] # hap set that originates the mutation / coalescent event
        hap_split= node_next[node_next[:,0] == who_lost] # hap set row
        
        if action == 1:
            # coalescence 
            prob_split= (hap_split[0,1]) / sum(node_next[:,1]) # proportion of ancestral hap set in previous AC
            
            probe= probe * prob_split
        
        if action == 0:
            # mutation
            
            prob_split= (hap_split[0,1]) / (sum(node_next[:,1])) # probability that this particular hap mutated.
            
            probe= probe * prob_split 
        
            
        ###
            
        new_pin= Pin * probe ## Probability inheritance.
        

        runUp(Up_lib,layer + 1,next_stop,Theta= Theta,probs= probs,prob_vec= prob_vec,Pin= new_pin)
    
    return prob_vec



### Theta

Probability of the data under varying values of _Theta_.

In [27]:
range_theta= np.linspace(.1,10,100)

Nsamp= dataT.shape[0]

Inf_sites_est= []
there= []

for x in range_theta:
    
    mut_prob= prob_mut(x,Nsamp)
    coal_prob= prob_coal(x,Nsamp)

    ## Mut was set to 0, coalescence to 1.
    probs_here= [mut_prob,coal_prob]

    
    #Browse= runUp(point_up,0,0,Theta= x,probs= probs_here,prob_vec= [],Pin= 1)
    Browse= runUp_balance(point_up,root_lib,0,0,Theta= x,probs= probs_here,prob_vec= [],Pin= 1)
    #print(len(Browse))
    probe_rec= sum(Browse)
    
    Inf_sites_est.append(probe_rec)
    there.append(x)


In [28]:

fig_here= [go.Scatter(
    y= Inf_sites_est,
    x= there,
    mode= 'markers'
)]

layout= go.Layout(
    xaxis= dict(
        range= [-.5,max(range_theta)],
        title= 'Theta'
    ),
    yaxis= dict(
        range= [0,max(Inf_sites_est) + .05],
        title= 'P'
    )
    
)

Figure= go.Figure(data=fig_here,layout= layout)

iplot(Figure)


Still missing something. The tree structure seems fine so the probem is in edge weight calculation. According to Figure 2.16 of GGVE, maximum values of theta are around 2.12, with probabilities under 10e-3.

In [29]:
from structure_tools.vcf_geno_tools import read_geno_nanum

Home= 'vcf/'
Chr= 8
filename= Home + 'Extract_Chr{}_15000.vcf'.format(Chr)

row_info= 6
header_info= 9
phased= False

genotype, summary, Names= read_geno_nanum(filename, row_info= row_info, header_info= header_info,phased= phased)

print('Number of markers: {}'.format(genotype.shape[1]))
print('Number of individuals: {}'.format(genotype.shape[0]))


{'fileformat': 'VCFv4.2', 'fileDate': '20190327', 'source': 'PLINKv1.90', 'contig': '<ID8,length28422468>', 'INFO': '<IDPR,Number0,TypeFlag,Description"Provisional reference allele, may not be based on real reference genome">', 'FORMAT': '<IDGT,Number1,TypeString,Description"Genotype">'}
Number of markers: 15000
Number of individuals: 3023


In [30]:
## read passport information

Input_file= '3K_info.txt'

RG_info= pd.read_csv('3K_info.txt',sep= '\t')

RG_info.head()

Unnamed: 0,IRIS_ID,NAME,Variety_Name_verif,COUNTRY,REGION,K9_cluster,Initial_subpop
0,B001,HEIBIAO,Heibiao,China,As6,GJ-tmp,temp
1,B002,SANSUIJIN,Sansuijin,China,As6,GJ-tmp,temp
2,B003,ZAOSHENGBAI,Zaoshengbai_,China,As6,GJ-adm,japx
3,B004,QIUGUANGTENGXI_104,Qiuguangtengxi_104_,Japan,As7,GJ-tmp,temp
4,B005,WANSHI,Wanshi,Japan,As7,GJ-tmp,temp


In [31]:
## Process Names vcf names.
## Instance specific processing due to ID copy in VCF file.

for x in range(len(Names)):
    ind= Names[x]
    newid= ind.split('_')
    if len(newid) > 2:
        newid= '_'.join(newid[:2])
    else:
        newid= newid[0]
    
    Names[x]= newid



In [32]:
from structure_tools.vcf_geno_tools import geno_subset_random

Sn= 500
Sm= 11000

ID_col= 'IRIS_ID'
subset_col= 'Initial_subpop'

code= {
    'ind1A':0,
    'ind1B':0,
    'ind2':0,
    'ind3':0,
    'aus':1,
    'temp':2,
    'trop':2,
    'subtrop':2,
    'aro': 3,
    'admx': 4
}


others= 'admx'

#gen_sample, subsummary, code_vec, code_lib, Nsample, Msample

gen_sample, subsummary, code_vec, code_lib, Nsample, Msample= geno_subset_random(genotype,summary, RG_info, ID_col,subset_col, Names,code=code, Sn= Sn, Sm= Sm)

color_groups= ['red','yellow','blue','green','purple','black','silver','silver','red3','deepskyeblue','navy','chartreuse','darkorchid3','goldenrod2']



gen_sample shape: 500, 11000


### B. Global variation

#### i. PCA

Perform PCA across data set.

Perform Mean shift clustering to attempt to extract genetically coherent groups of accessions.

These can later be used for supervised analysis.

In [33]:

PCA_color_ref= ['darkseagreen','crimson', 'darkorange', 'darkblue', 'darkcyan',
            'darkgoldenrod', 'darkgray', 'darkgrey', 'darkgreen',
            'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange',
            'darkorchid', 'darkred', 'darksalmon', 'darkseagreen',
            'darkslateblue', 'darkslategray', 'darkslategrey',
            'darkturquoise', 'darkviolet', 'deeppink']

## Perform PCA
n_comp= 5
pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')

feats= pca.fit_transform(gen_sample)

## perform MeanShift clustering.
bandwidth = estimate_bandwidth(feats, quantile=0.15)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=True, min_bin_freq=15)
ms.fit(feats)
labels1 = ms.labels_
label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1)))}

###
from structure_tools.Tutorial_subplots import plot_global_classes


plot_global_classes(feats,
                    code_lib,
                    label_select,
                    color_groups,
                    PCA_color_ref,
                    title_I= 'IRRI class',
                    title_II= 'Mean_shift',height= 400, width= 950)


This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3 ]  [ (2,2) x4,y4 ]



In [38]:
references= ['Local','External']

chose_refs= 1

ref_chosen= references[chose_refs]

if ref_chosen== 'Local':
    ref_dict= label_select
    ref_vector= labels1

if ref_chosen== 'External':
    ref_dict= code_lib
    ref_vector= code_vec

In [44]:
from structure_tools.vcf_geno_tools import geno_window_split
##### 
window_size= 25
Steps= 14

Windows, Out= geno_window_split(gen_sample,
                                subsummary,
                                Steps= Steps,
                                window_size=window_size)

print('number of chromosomes: {}'.format(len(Windows)))
print('number of windows: {}'.format(sum([len(Windows[x].keys()) for x in Windows.keys()])))


number of chromosomes: 1
number of windows: 785


In [45]:
some_windows= np.random.choice(list(Windows[8].keys()),5)
some_windows

array([25195467, 17091752,  8570355, 23791877, 15687409])

In [60]:
wind_select= 15687409

popA= 1

data_w= Windows[Chr][wind_select]
data_w= data_w[code_lib[popA]]
data_w[data_w==1]= 0
data_w[data_w==2]= 1

In [61]:
dataT= data_w

nsamp= dataT.shape[0]

config_dataw, hap_str= get_config(dataT,nsamp)


hap_sol= list(hap_str.keys())
hap_sun= np.array([np.array(list(x),dtype= int) for x in hap_sol])

hap_size= [len(hap_str[x]) for x in hap_sol]
hap_size= {z:[x for x in range(len(hap_size)) if hap_size[x] == z] for z in list(set(hap_size))}



passing= hap_size.keys()
pack= list(it.chain(*[hap_size[x] for x in passing]))
passport= list(it.chain(*[[x]*len(hap_size[x]) for x in passing]))

pack= [[pack[x],passport[x]] for x in range(len(pack))]
pack= sorted(pack)
pack= np.array(pack)

Dict_mat= {0: 
           {
               -2: hap_sun,
               -1: [0] * hap_sun.shape[0],
               0: pack
              }
          }

point_up= recursively_default_dict()

point_dn= recursively_default_dict()


In [62]:
root_lib, point_up, point_dn = Inf_sites(Dict_mat,point_up,point_dn,layer_range= 54,sub_sample= 0,poppit= False)

layer: 0; len: 2
layer: 1; len: 5
layer: 2; len: 15
layer: 3; len: 33
layer: 4; len: 59
layer: 5; len: 95
layer: 6; len: 140
layer: 7; len: 193
layer: 8; len: 252
layer: 9; len: 315
layer: 10; len: 381
layer: 11; len: 449
layer: 12; len: 521
layer: 13; len: 598
layer: 14; len: 675
layer: 15; len: 747
layer: 16; len: 809
layer: 17; len: 857
layer: 18; len: 889
layer: 19; len: 900
layer: 20; len: 893
layer: 21; len: 870
layer: 22; len: 829
layer: 23; len: 778
layer: 24; len: 722
layer: 25; len: 662
layer: 26; len: 596
layer: 27; len: 526
layer: 28; len: 456
layer: 29; len: 384
layer: 30; len: 312
layer: 31; len: 242
layer: 32; len: 181
layer: 33; len: 130
layer: 34; len: 88
layer: 35; len: 57
layer: 36; len: 35
layer: 37; len: 20
layer: 38; len: 12
layer: 39; len: 6
layer: 40; len: 4
layer: 41; len: 1
layer: 42; len: 1
time elapsed: 19.177 s


In [59]:
range_theta= np.linspace(.01,10,100)

Nsamp= dataT.shape[0]
from IPython.display import clear_output

Inf_sites_est= []
there= []

for th in range_theta:
    
    mut_prob= prob_mut(th,Nsamp)
    coal_prob= prob_coal(th,Nsamp)

    ## Mut was set to 0, coalescence to 1.
    probs_here= [mut_prob,coal_prob]

    
    #Browse= runUp(point_up,0,0,Theta= x,probs= probs_here,prob_vec= [],Pin= 1)
    Browse= runUp_balance(point_up,root_lib,0,0,Theta= th,probs= probs_here,prob_vec= [],Pin= 1)
    #print(len(Browse))
    probe_rec= sum(Browse)
    
    print('theta: {}'.format(th))
    
    Inf_sites_est.append(probe_rec)
    there.append(th)
    
    clear_output()


KeyboardInterrupt: 

In [None]:

fig_here= [go.Scatter(
    y= Inf_sites_est,
    x= there,
    mode= 'markers'
)]

layout= go.Layout(
    xaxis= dict(
        range= [-.5,max(range_theta)],
        title= 'Theta'
    ),
    yaxis= dict(
        range= [0,max(Inf_sites_est) + .05],
        title= 'P'
    )
    
)

Figure= go.Figure(data=fig_here,layout= layout)

iplot(Figure)


In [84]:
point_up[16]

array([[ 0,  0,  2,  1, -1]])