In [1]:
import scipy
import numpy as np
import pandas as pd
import itertools as it

from math import sin
import collections

def recursively_default_dict():
    """Return an empty defaultdict whose missing keys are themselves filled
    with defaultdicts of the same kind, so nested keys can be assigned
    without creating the intermediate levels first."""
    nested_factory = recursively_default_dict
    return collections.defaultdict(nested_factory)

from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import scale

from scipy.stats.stats import pearsonr 

from scipy.stats import invgamma 
from scipy.stats import beta
import matplotlib.pyplot as plt

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)

from IPython.display import clear_output

from Euc_pca_tools.Input_tools import read_Darwin, read_refs
from Euc_pca_tools.Euc_to_fst import Euc_to_fst

In [None]:
# NOTE(review): `whil` is not defined anywhere in the visible notebook, so running this
# cell would raise NameError and stop a "Run All". It looks like an unfinished `while`
# statement (or a deliberate stop marker) — confirm the intent and remove the cell.
whil

## I. Load data.

The simulations performed in this post will be adjusted to the sequence length of the DataMatrix provided. We extracted SNP variation at a number of loci and stored it in DataMatrix format (Darwin software).

We first load and organise our data. All the information available is in the DataMatrix file, which is useful.


In [26]:
## load real data

# Candidate DataMatrix files. Exactly one assignment is left active so the locus
# being analysed is unambiguous (earlier assignments used to be silently
# overwritten by the bh4 one). Uncomment a different line to switch locus.
#DM_filename= 'Dmatrices/RC_DataMatrix.txt'
#DM_filename= 'Dmatrices/qSH1_DataMatrix.txt'
#DM_filename= 'Dmatrices/random1_DataMatrix.txt'
DM_filename= 'Dmatrices/bh4_DataMatrix.txt'
#DM_filename= 'Dmatrices/Waxy_DataMatrix.txt'
#DM_filename= 'Dmatrices/SH4_DataMatrix.txt'
#DM_filename= 'Dmatrices/Osc1_DataMatrix.txt'
#DM_filename= 'Dmatrices/Jtrop_ind3_chrom11_DataMatrix.txt'
#DM_filename= 'Dmatrices/coord_test_DataMatrix.txt'
data, Names= read_Darwin(DM_filename)
gen_data= np.array(data)

ref_file= 'Complementary_data/refs_CORE.txt' # columns: ID; code. code preferably as integer.

# Fallback when no reference file is supplied: keep every row, so that later
# cells that index with `present` still run.
present= list(range(gen_data.shape[0]))

if ref_file:
    ## read accession data
    ref_lib, codes= read_refs(ref_file)
    ref_names= [z for z in it.chain(*[ref_lib[r] for r in ref_lib.keys()])]
    ref_codes= np.repeat(list(ref_lib.keys()),[len(ref_lib[x]) for x in ref_lib.keys()])

    # name -> code lookup, replacing a repeated list.index() scan per sample.
    # Built from the reversed sequences so that, for a duplicated name, the
    # FIRST occurrence wins, exactly as list.index() would have returned.
    # Assumes accession names are hashable (strings) — confirm against read_Darwin.
    code_by_name= dict(zip(ref_names[::-1], ref_codes[::-1]))

    # Rows of gen_data whose name appears in the reference file
    # (some names may be missing from it).
    present= [x for x in range(gen_data.shape[0]) if Names[x] in code_by_name]

    # Reference code of each retained row, then rows grouped by code.
    color_code= [code_by_name[Names[x]] for x in present]
    color_indexes= {z:[present[x] for x in range(len(color_code)) if color_code[x] == z] for z in list(set(color_code))}

    ## RGB triplet per reference code.
    colors_rgb= {
        0: [192,192,192],
        1: [255,0,0],
        2: [0,128,0],
        3: [255,255,0],
        4: [0,0,255],
        5: [255,255,255],
    }

    # Same codes, formatted as 'rgba(r,g,b,.8)' strings.
    colors= {
        z: 'rgba({},.8)'.format(','.join([str(x) for x in colors_rgb[z]])) for z in colors_rgb.keys()
    }

    ## Reference codes are currently integers. From here on `ref_names` holds
    ## the display labels handed to plot_3D (presumably indexed by code —
    ## confirm against plot_3D), no longer the accession names read above.
    ref_names= ['blanc','Indica','black','cAus','Japonica','control']


print(gen_data.shape)


(947, 195)


##  III. Simulations. 


The next function generates a number of data sets with a varying number of populations at varying genetic distances. For each data set, Euclidean distances between the centroids of the generated populations are calculated in feature space.

In summary, for each iteration, the next blocks will:
- select population vectors from the base data set created in **I.**.
- calculate pairwise Fsts between the selected vectors.
- Generate a varying number of samples from each of the vectors selected.
- perform PCA on each of the generated data sets.
- Calculate pairwise centroid distances between the PCA projections of different haploid populations produced.

First a background data set of population frequency vectors of the appropriate length is generated.

- visit notebook [1 Generate Samples](https://nbviewer.jupyter.org/github/SantosJGND/Genetic-data-analysis/blob/master/Notebooks/1.%20Generating_haplotypes.ipynb) ([Stats Lab](https://github.com/SantosJGND/Stats_Lab)) for more details on the background data set.


In [27]:
# Parameters of the background set of population frequency vectors.
Nbranches= 4 # number of axes
L= gen_data.shape[1] # number of markers.
n= 100 # number of frequency vectors.
n_comp_haps= 5 # passed as `n_comp` to Euc_to_fst in the next cell.
rangeA= [1,2.5] # range along which to vary parameter a of beta dist.
rangeB = [.1,.6] # range along which to vary parameter b of beta dist.
steps= 20 # number of steps along ranges of parameters a and b.
n_comp = L # number of components to retain in PCA of frequency vectors (>>).
density= 50 # number of populations along each branch.

from Euc_pca_tools.Generate_freq_vectors import generate_Branches_Beta

# Pass the named constants instead of repeating the literals 4 and 50, so that
# editing Nbranches / density above actually changes the simulation.
features, vector_lib= generate_Branches_Beta(Nbranches,density,L,n,rangeA,rangeB,steps,n_comp)

print(features.shape)
print(vector_lib.shape)

(200, 195)
(200, 195)


In [28]:
# Run the simulation helper on the frequency vectors generated above.
# `fst_x` and `y_true` are plotted in the next section; `m_coeff` and `b` are
# reused later when drawing distances between MeanShift clusters.
(m_coeff, b, Distances, fst_x, y_true) = Euc_to_fst(
    vector_lib,
    n_comp=n_comp_haps,
    ploidy=2,
)

Iter: 19, vectors selected: [ 44 183  89], hap length: 195


## IV. Data analysis


We will first confirm that our prediction of genetic distances from PCA Euclidean distances is reasonable.

In [29]:
# NOTE(review): `tools` is not referenced in this cell; presumably kept for
# plotting helpers elsewhere — confirm before removing.
from plotly import tools
from Euc_pca_tools.plot_tools import plot_fst

# Plot the two outputs of Euc_to_fst (fst_x, y_true) produced by the previous cell.
plot_fst(fst_x,y_true)

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]



**Fig. True and predicted Fsts.**

### Fst and Euclidean genetic distances

## Real data

In this section we apply the relation learned during the preceding simulation step to the study of genetic structure at the chosen locus.

i. We will first perform PCA on the data matrix loaded, together with haplotypes from the two control populations. The data matrix came with labels and individual IDs. These are displayed in the first graph.

ii. We then apply Mean Shift unsupervised clustering to the real data in feature space. MeanShift is a clustering algorithm that relies on the KDE estimate of observed data to identify peaks, to which it assigns observations based on proximity ( ** ). The output of this clustering is displayed on the second graph.

iii. For the third part the user is first asked to choose a set of MeanShift clusters. The pairwise distances between the centroids of the chosen clusters will be calculated and their respective Fsts inferred from the relation shown in the above plot. The chosen clusters and the vectors connecting them are plotted, annotated with their respective distances.



In [41]:
### Project the real data onto its first three principal components and plot it.
from Euc_pca_tools.plot_tools import plot_3D
diploid= True

length_haps= gen_data.shape[1]

## Perform PCA on the loaded data matrix.
# nan_to_num replaces NaN with 0 (and +/-inf with large finite values) so PCA can be fitted.
gen_data= np.nan_to_num(gen_data)

# The randomized SVD solver is stochastic: fixing random_state makes the
# projections, and therefore the MeanShift cluster labels derived from them in
# the following cells, reproducible from one run to the next.
pca = PCA(n_components=3, whiten=False,svd_solver='randomized',random_state=0).fit(gen_data)
feats= pca.transform(gen_data)
var_comps= pca.explained_variance_ratio_

# The colour/label structures only exist when a reference file was loaded.
if ref_file:
    plot_3D(feats,color_indexes,colors= colors,Names=Names,var_comps= var_comps,ref_names= ref_names)


plotly.graph_objs.Scene is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.Scene




**Fig. Genetic structure** at locus of choice.

In [45]:
#### MeanShift clusters

N= 50

# Bandwidth for MeanShift, estimated from all projected samples.
bandwidth = estimate_bandwidth(feats, quantile=0.1)

# KDE bandwidth grid search: configured here, not fitted in this cell.
params = {'bandwidth': np.linspace(np.min(feats), np.max(feats),20)}
grid = GridSearchCV(
    KernelDensity(algorithm = "ball_tree",breadth_first = False),
    params,
    verbose=0,
)

## perform MeanShift clustering on the rows listed in `present`.
ms = MeanShift(
    bandwidth=bandwidth,
    bin_seeding=False,
    cluster_all=True,
    min_bin_freq=25,
).fit(feats[present,:])
labels1 = ms.labels_

# cluster label -> row indices of `feats` assigned to it (label -1 is skipped).
label_select = {
    cluster: [present[pos] for pos, assigned in enumerate(labels1) if assigned == cluster]
    for cluster in sorted(set(labels1))
    if cluster != -1
}

# Centroid of every cluster, then the upper-triangle (pairwise) centroid distances.
MS_centroids= [feats[members,:].mean(axis= 0) for members in label_select.values()]
iu_control= np.triu_indices(len(label_select), k=1)
MS_pair_dist= pairwise_distances(MS_centroids,metric= 'euclidean')[iu_control]

plot_3D(feats,label_select,Names=Names,var_comps= var_comps)



plotly.graph_objs.Scene is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.Scene




**Fig. Mean Shift clustering of PCA projections** of real data at locus of choice.

Identify accessions at MS cluster of choice.

At the top of the next block, select the clusters between which to calculate distances.

In [33]:
from Euc_pca_tools.plot_tools import plot_vertix
#### Making it interesting:
# MeanShift clusters between which to draw distances.
requested_vertices= [0,1,2,10]

# How many clusters MeanShift finds depends on the locus loaded, so a
# hard-coded label (e.g. 10) may not exist. Keep only the labels present in
# label_select and report the ones that were dropped.
# NOTE(review): assumes plot_vertix looks the vertices up in label_select — confirm.
Vertices= [v for v in requested_vertices if v in label_select]
missing_vertices= [v for v in requested_vertices if v not in label_select]
if missing_vertices:
    print('Skipping clusters not found by MeanShift: {}'.format(missing_vertices))

plot_vertix(feats,label_select,Vertices,m_coeff= m_coeff,b= b, color= '#1f77b4')


plotly.graph_objs.Scene is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.Scene




**Fig. Genetic structure summary** Genetic distances between clusters selected are represented as edges.