In [1]:
import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import MeanShift, estimate_bandwidth

import pandas as pd

from scipy import stats
from scipy.stats import beta
from math import sin
from random import randint
from IPython.display import clear_output
import matplotlib.pyplot as plt
import itertools as it

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

import collections

def recursively_default_dict():
        return collections.defaultdict(recursively_default_dict)

from matplotlib.collections import BrokenBarHCollection
import re

from structure_tools.Modules_tools import return_fsts

PCA_color_ref= ['darkseagreen','crimson', 'darkorange', 'darkblue', 'darkcyan',
            'darkgoldenrod', 'darkgray', 'darkgrey', 'darkgreen',
            'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange',
            'darkorchid', 'darkred', 'darksalmon', 'darkseagreen',
            'darkslateblue', 'darkslategray', 'darkslategrey',
            'darkturquoise', 'darkviolet', 'deeppink']

## vcf analysis
Jupyter notebook for the local analysis of genetic data stored in .vcf format.

Perform analysis of structure across data set, followed by a more detailed study of variation across local genomic windows.

### Input

In [2]:
from structure_tools.vcf_geno_tools import simple_read_vcf

vcf_file= 'data_cleanRefs_simple_Admx.vcf'

genotype, summary, info_save= simple_read_vcf(vcf_file,row_info= 5,header_info= 9,phased= True)

print('Number of markers: {}'.format(genotype.shape[1]))
print('Number of individuals: {}'.format(genotype.shape[0]))

Number of markers: 40000
Number of individuals: 130


In [3]:
summary.head()


Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT
0,1,8,1,A,T,.,PASS,.,GT:AD:DP
1,1,33,2,A,T,.,PASS,.,GT:AD:DP
2,1,74,3,A,T,.,PASS,.,GT:AD:DP
3,1,83,4,A,T,.,PASS,.,GT:AD:DP
4,1,87,5,A,T,.,PASS,.,GT:AD:DP


### Global variation

Perform PCA across data set.

Perform Mean shift clustering to attempt to extract genetically coherent groups of accessions.

These will later be used for supervised analysis.

In [4]:
n_comp= 3

In [5]:
from structure_tools.Tutorial_subplots import plot_global_pca

## Perform PCA
pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')

feats= pca.fit_transform(genotype)

In [6]:
## perform MeanShift clustering.
bandwidth = estimate_bandwidth(feats, quantile=0.1)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=True, min_bin_freq=45)
ms.fit(feats)
labels1 = ms.labels_
label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1)))}
###

In [7]:
###
plot_global_pca(feats,label_select,PCA_color_ref,title= 'global_pca',height= 500,width= 950)

In [33]:
select_refs= [0,1,2]
label_vector= [[len(select_refs),labels1[x]][int(labels1[x] in select_refs)] for x in range(genotype.shape[0])]

Whose= list(range(genotype.shape[0]))


## Imputation

### Creating a matrix, introducing missing values

In [140]:
print('full data set shape: {}'.format(genotype.shape))

nan_n= 1

xnan= np.random.randint(0,genotype.shape[1],size= nan_n)
ynan= np.random.randint(0,genotype.shape[0],size= nan_n)

nan_coords= [ynan,xnan]
nan_coords= np.array(nan_coords).T

print(nan_coords)


full data set shape: (130, 40000)
[[  27 5150]]


In [141]:
from sklearn.metrics import pairwise_distances

nan_idx= 0

wind_sizes= 100
Nreps= 400
ncomps= 5
dimN= 2

metric= 'euclidean'

nan_obs= nan_coords[nan_idx]
nan_acc= nan_obs[0]
nan_pos= nan_obs[1]

other_obs= [x for x in range(genotype.shape[0]) if x != nan_acc]

dist_store= []

for idx in range(Nreps):
    
    st= -1
    while st == -1:
        stp= np.random.randint(0,genotype.shape[1]-wind_sizes)
        dinc= nan_pos - stp
        if dinc > wind_sizes or dinc < 0:
            st= stp
    
    nwind= genotype[:,stp:(stp+wind_sizes)]
    
    pca2 = PCA(n_components=ncomps, whiten=False,svd_solver='randomized')
    featw= pca2.fit_transform(nwind)
    
    obsn= featw[nan_acc,:dimN].reshape(1,-1)
    dist_vec= pairwise_distances(obsn, featw[other_obs,:dimN],
                                                metric=metric)
    
    dist_store.extend(dist_vec)
    
dist_store= np.array(dist_store)
dist_store.shape

(400, 129)

In [142]:

pca2 = PCA(n_components=ncomps, whiten=False,svd_solver='randomized')
featd= pca2.fit_transform(dist_store)
bandwidth = estimate_bandwidth(featd, quantile=0.25)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=False, min_bin_freq=35)
ms.fit(featd)
labelsf = ms.labels_
labelf_select = {y:[x for x in range(len(labelsf)) if labelsf[x] == y] for y in sorted(list(set(labelsf)))}


fig= [go.Scatter(
    x= featd[labelf_select[i],0],
    y= featd[labelf_select[i],1],
    mode= 'markers',
    name= str(i)
) for i in labelf_select.keys()]

layout= go.Layout()

Figure= go.Figure(data= fig, layout= layout)

iplot(Figure)


In [145]:
#dist_store_norm= 
dist_var= np.std(dist_store,axis= 0)**2
who_plot= [x for x in range(len(label_vector)) if label_vector[x] in [0,1,2,3]]
idvector= ['pop{}_{}'.format(label_vector[x],Whose[x]) for x in who_plot]

fig = go.Figure([go.Bar(
    x=idvector, y=dist_var
)])
layout= go.Layout()

Figure= go.Figure(data= fig, layout= layout)
iplot(Figure)

In [146]:
local_l= genotype[:,(nan_pos-int(wind_sizes/2)):(nan_pos+int(wind_sizes/2))]
coords= {z:[x for x in range(len(label_vector)) if label_vector[x] == z] for z in list(set(label_vector))}

pca2 = PCA(n_components=ncomps, whiten=False,svd_solver='randomized')
featl= pca2.fit_transform(local_l)

fig= [go.Scatter(
    x= featl[coords[i],0],
    y= featl[coords[i],1],
    mode= 'markers',
    name= str(i)
) for i in coords.keys()]

fig.append(go.Scatter(
    mode='markers',
    x=[featl[nan_acc,0]],
    y=[featl[nan_acc,1]],
    marker=dict(
        color='LightSkyBlue',
        size=20,
        line=dict(
            color='red',
            width=7
        )
    ),
    showlegend=False
))

layout= go.Layout()

Figure= go.Figure(data= fig, layout= layout)

iplot(Figure)

### Distance regression

### Grid approach

In [147]:
Quanted_set= np.array(featl)
P= 20
dimN= 2

to_mesh= [np.linspace(min(Quanted_set[:,x]),max(Quanted_set[:,x]),P) for x in range(dimN)]

coords_net = np.meshgrid(*to_mesh, indexing= 'ij')

pprod= [list(range(P)) for y in range(dimN)]
traces= [x for x in it.product(*pprod)]

background= np.array(coords_net)

background= background.reshape(dimN,np.prod(background.shape[1:])).T
background.shape

(400, 2)

In [148]:
# subselect by variance in distances:

## perform MeanShift clustering.
bandwidth = estimate_bandwidth(featl, quantile=0.2)
#bandwidth = estimate_bandwidth(dist_var.reshape(-1,1), quantile=0.2)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=False, min_bin_freq=35)
featlt= pca2.fit_transform(local_l[other_obs])
ms.fit(featlt) #(dist_var.reshape(-1,1))
labels_std = ms.labels_
std_select = {y:[x for x in range(len(labels_std)) if labels_std[x] == y] for y in sorted(list(set(labels_std)))}
std_select.keys()

std_gpmeans= {z: np.mean([dist_var[x] for x in g]) for z,g in std_select.items() if z != -1}

std_gp_use= sorted(std_gpmeans,key= std_gpmeans.get)
d= 0
idx = 0
while d != 1:
    g=std_select[std_gp_use[idx]] 
    
    if len(g) >= 15:
        std_gp_use= list(g)
        d= 1
        
    idx+= 1
#std_gp_use= std_select[std_gp_use]

#std_gp_use= [other_obs[x] for x in std_gp_use]
#std_gp_use= list(range(len(labels_std)))


In [149]:
workfeat= featl[std_gp_use,:dimN]

dist_grid= pairwise_distances(background, workfeat,
                                            metric=metric)

dist_grid.shape

(400, 49)

In [150]:
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

dist_ref_select= 0

dist_comps= 10
Bandwidth_split = 30

####
####
dist_ref= dist_store[labelf_select[dist_ref_select],:]
dist_ref= dist_ref[:,std_gp_use]
pca2 = PCA(n_components=dist_comps, whiten=False,svd_solver='randomized')

pca_dists= pca2.fit(dist_grid)
featw= pca_dists.transform(dist_grid)
featref= pca2.transform(dist_ref)

params = {'bandwidth': np.linspace(np.min(featref), np.max(featref),Bandwidth_split)}
grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0,cv= 3,iid= False)

grid.fit(featref)
kde = grid.best_estimator_

grid_likes= kde.score_samples(featw)
grid_likes= np.exp(grid_likes)


In [151]:
fig= [go.Scatter3d(
    x= featref[:,0],
    y= featref[:,1],
    z= featref[:,2],
    mode= 'markers',
    marker= {
        'color':grid_likes,
        'colorbar': go.ColorBar(
            title= 'ColorBar'
        ),
        'colorscale':'Viridis',
        'line': {'width': 0},
        'size': 15,
        'symbol': 'circle',
      "opacity": 1
      }
)]


layout= go.Layout()

Figure= go.Figure(data= fig, layout= layout)

iplot(Figure)


plotly.graph_objs.ColorBar is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.marker.ColorBar
  - plotly.graph_objs.surface.ColorBar
  - etc.




In [153]:
dist_mean= -np.min(dist_grid,axis= 1)
dist_mean= np.exp(dist_mean)
print(dist_mean.shape)
fig= [go.Scatter3d(
    x= background[:,0],
    y= background[:,1],
    z= grid_likes,
    mode= 'markers',
    marker= {
        'color':grid_likes,
        'colorbar': go.ColorBar(
            title= 'ColorBar'
        ),
        'colorscale':'Viridis',
        'line': {'width': 0},
        'size': 25,
        'symbol': 'circle',
      "opacity": 1
      }
)]

layout= go.Layout()

Figure= go.Figure(data= fig, layout= layout)

iplot(Figure)

(400,)
