In [1]:
import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import MeanShift, estimate_bandwidth

import pandas as pd

from scipy import stats
from scipy.stats import beta
from math import sin
from random import randint
from IPython.display import clear_output
import matplotlib.pyplot as plt
import itertools as it

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

import collections

def recursively_default_dict():
    """Return a defaultdict whose missing keys are themselves nested defaultdicts.

    Any depth of keys can be assigned without creating the intermediate
    levels first, e.g. ``d['a']['b']['c'] = 1``.
    """
    nested_factory = recursively_default_dict
    return collections.defaultdict(nested_factory)

from matplotlib.collections import BrokenBarHCollection
import re

from structure_tools.Modules_tools import return_fsts

# Colour names handed to the plotting helpers, indexed by cluster label.
# NOTE(review): 'darkorange' and 'darkseagreen' each appear twice, and the
# gray/grey spellings name the same colour, so clusters with high label
# indices may be drawn in a colour that is already in use.
PCA_color_ref = [
    'darkseagreen',
    'crimson',
    'darkorange',
    'darkblue',
    'darkcyan',
    'darkgoldenrod',
    'darkgray',
    'darkgrey',
    'darkgreen',
    'darkkhaki',
    'darkmagenta',
    'darkolivegreen',
    'darkorange',
    'darkorchid',
    'darkred',
    'darksalmon',
    'darkseagreen',
    'darkslateblue',
    'darkslategray',
    'darkslategrey',
    'darkturquoise',
    'darkviolet',
    'deeppink',
]

## PCA inverse transformation.

Principal component analysis (PCA) is an extremely useful tool in population genetics, and indeed in any study that requires
a summary of sample covariance. In population genetics the variance between populations is informative of the evolutionary 
history captured in a given data set. In conditions of pure diffusion a direct relationship can even be found between divergence, as measured in coalescent time or Fst, and distance in PC space. 

The possibility of working in a Euclidean space also opens the door to density-based methods.

Another interesting side of PC transformation is the possibility of inverse transformation, which recovers raw data from feature space coordinates. 

However, it is likely that the accuracy of inverse transformation depends not only on the information retained during transformation but also on the information present in the data set itself.

Before we incorporate PCA inverse transformation in our analyses, its limits must be understood. 

In this notebook we will study the accuracy of inverse transformation in reproducing raw SNP data. 



### vcf Data

Jupyter notebook for the local analysis of genetic data stored in .vcf format.

Perform an analysis of structure across the data set, followed by a more detailed study of variation across local genomic windows.

#### Input

In [2]:
from structure_tools.vcf_geno_tools import simple_read_vcf

# VCF file read from the notebook's working directory.
vcf_file = 'data_cleanRefs_simple_Admx.vcf'

# row_info / header_info presumably give the number of header rows and of
# leading annotation columns in the file -- TODO confirm against simple_read_vcf.
genotype, summary, info_save = simple_read_vcf(
    vcf_file,
    row_info=5,
    header_info=9,
    phased=True,
)

# genotype is indexed as (individuals, markers).
print('Number of markers: {}'.format(genotype.shape[1]))
print('Number of individuals: {}'.format(genotype.shape[0]))

Number of markers: 40000
Number of individuals: 130


In [3]:
# Preview the first rows of the per-marker summary table returned by simple_read_vcf.
summary.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT
0,1,8,1,A,T,.,PASS,.,GT:AD:DP
1,1,33,2,A,T,.,PASS,.,GT:AD:DP
2,1,74,3,A,T,.,PASS,.,GT:AD:DP
3,1,83,4,A,T,.,PASS,.,GT:AD:DP
4,1,87,5,A,T,.,PASS,.,GT:AD:DP


### Global variation

Perform PCA across the data set.

Perform Mean shift clustering to attempt to extract genetically coherent groups of accessions.

These will later be used for supervised analysis.

In [4]:

## Perform PCA
# Number of principal components kept for the global structure analysis.
n_comp = 3
pca = PCA(
    n_components=n_comp,
    whiten=False,
    svd_solver='randomized',
)

# Coordinates of every individual on the retained components.
feats = pca.fit_transform(genotype)

In [5]:
## perform MeanShift clustering.
# The kernel bandwidth is estimated from the PCA coordinates themselves.
bandwidth = estimate_bandwidth(feats, quantile=0.1)

ms = MeanShift(
    bandwidth=bandwidth,
    bin_seeding=False,
    cluster_all=True,
    min_bin_freq=45,
)
ms.fit(feats)
labels1 = ms.labels_

# Map each cluster label to the row indices of the individuals assigned to it.
label_select = {
    cluster: [idx for idx, assigned in enumerate(labels1) if assigned == cluster]
    for cluster in sorted(set(labels1))
}

In [6]:
from structure_tools.Tutorial_subplots import plot_global_pca

# Draw the global PCA with one colour per MeanShift cluster.
plot_global_pca(
    feats,
    label_select,
    PCA_color_ref,
    title='global_pca',
    height=500,
    width=950,
)

In [7]:
# Cluster labels treated as reference groups.
select_refs = [0, 1, 2]

# Individuals in a reference cluster keep their label; every other individual
# is pooled under the extra label len(select_refs).
label_vector = [
    labels1[idx] if labels1[idx] in select_refs else len(select_refs)
    for idx in range(genotype.shape[0])
]

# Row indices of all individuals.
Whose = list(range(genotype.shape[0]))


## Inverse Transformation

In [8]:
print('full data set shape: {}'.format(genotype.shape))

# Number of (individual, marker) coordinates to draw.
nan_n = 1

# Marker indices are drawn first, then individual indices.
# NOTE(review): no random seed is set, so the drawn coordinates change on every run.
xnan = np.random.randint(0, genotype.shape[1], size=nan_n)
ynan = np.random.randint(0, genotype.shape[0], size=nan_n)

# One row per coordinate: [individual index, marker index].
nan_coords = np.column_stack((ynan, xnan))

print(nan_coords)


full data set shape: (130, 40000)
[[   53 35054]]


In [9]:
# Which of the drawn coordinates to examine.
nan_idx = 0

# Each row of nan_coords is [individual index, marker index].
nan_obs = nan_coords[nan_idx]
nan_acc, nan_pos = nan_obs[0], nan_obs[1]

# Settings shared by the cells below.
wind_sizes = 100      # width, in markers, of the window centred on nan_pos
Nreps = 400
ncomps = 5            # principal components kept for the local PCA
dimN = 2
metric = 'euclidean'


In [10]:
# Window of markers centred on the target position. The start is shifted so
# the window stays inside the marker range: a negative start would otherwise
# be read as an offset from the end of the array and yield an empty or
# unrelated window.
half_wind= int(wind_sizes / 2)
wind_start= min(max(0, nan_pos - half_wind), max(0, genotype.shape[1] - 2 * half_wind))
local_l= genotype[:,wind_start:(wind_start + 2 * half_wind)]

# Row indices of the individuals carrying each label.
coords= {
    z: [idx for idx, lab in enumerate(label_vector) if lab == z]
    for z in set(label_vector)
}

pca2 = PCA(n_components=ncomps, whiten=False,svd_solver='randomized')
featl= pca2.fit_transform(local_l)

# One scatter trace per label on the first two local components.
figwl= [go.Scatter(
    x= featl[coords[i],0],
    y= featl[coords[i],1],
    mode= 'markers',
    name= str(i)
) for i in coords.keys()]

# Hollow red ring around the target individual.
figwl.append(go.Scatter(
    mode='markers',
    x=[featl[nan_acc,0]],
    y=[featl[nan_acc,1]],
    marker=dict(
        color='rgba(135, 206, 250, 0)',
        size=25,
        opacity= 1,
        line=dict(
            color='red',
            width=5
        )
    ),
    showlegend=False
))

layout= go.Layout(
    title= 'Local PCA around marker {} (target individual {} circled)'.format(nan_pos, nan_acc),
    xaxis= dict(title= 'PC1'),
    yaxis= dict(title= 'PC2')
)

Figure_wl= go.Figure(data= figwl, layout= layout)

iplot(Figure_wl)

In [11]:
###
# Observed genotypes of the target individual across the local window.
hap_check= local_l[nan_acc]

other_obs= [x for x in range(genotype.shape[0]) if x != nan_acc]
###
ncomp= 130
ploidy= 2

# Reuse the window held in local_l instead of slicing genotype a second time,
# so hap_check and the PCA input always describe the same markers.
nan_wind= local_l

pca_miss = PCA(n_components=ncomps, whiten=False,svd_solver='randomized')
pca_miss.fit(nan_wind)

# Map the target's feature-space coordinates back to marker space. The vector
# is passed as a single-row matrix, the documented input shape.
inv_hap= pca_miss.inverse_transform(np.asarray(featl[nan_acc]).reshape(1,-1))
# NOTE(review): the integer cast truncates toward zero rather than rounding,
# and the result is then scaled by ploidy -- confirm this matches the intended
# genotype encoding.
inv_hap= np.array(inv_hap, dtype= int)
inv_hap= inv_hap.reshape(1,-1) * ploidy

print(hap_check)

[0 2 0 2 0 0 2 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 2 0 0 0
 0 0 0 0 2 0 0 2 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0
 0 0 0 2 2 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0 2 0 0 0 0 2]


In [12]:
# Reconstructed genotypes of the target individual, to compare by eye with hap_check.
print(inv_hap)

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0
  0 0 0 0 0 0 0 0 2 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 2
  0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 2]]


In [13]:
# Project the reconstructed genotypes back into the feature space of pca_miss.
feat_db= pca_miss.transform(inv_hap)
# One row (the single reconstructed individual) by ncomps columns.
feat_db.shape

(1, 5)

In [14]:

def _ring_trace(point, colour, label):
    """Return a hollow ring marker drawn at the first two coordinates of `point`."""
    return go.Scatter(
        mode='markers',
        x=[point[0]],
        y=[point[1]],
        marker=dict(
            color='rgba(135, 206, 250, 0)',
            size=25,
            opacity= 1,
            line=dict(
                color=colour,
                width=5
            )
        ),
        name= label,
        # Shown in the legend so the two rings can be told apart.
        showlegend=True
    )


# One scatter trace per label on the first two local components.
figwl= [go.Scatter(
    x= featl[coords[i],0],
    y= featl[coords[i],1],
    mode= 'markers',
    name= str(i)
) for i in coords.keys()]

# Red ring: the target individual's original coordinates.
figwl.append(_ring_trace(featl[nan_acc], 'red', 'observed'))

# Blue ring: the coordinates of its inverse-transformed, re-projected genotypes.
figwl.append(_ring_trace(feat_db[0], 'blue', 'inverse transform'))

layout= go.Layout(
    title= 'Observed vs. inverse-transformed position of the target individual',
    xaxis= dict(title= 'PC1'),
    yaxis= dict(title= 'PC2')
)

Figure_wl= go.Figure(data= figwl, layout= layout)

iplot(Figure_wl)

## Varying sample and feature number

In [327]:

# Number of (individual, marker) coordinates to draw for the parameter sweep.
nan_n = 30

# Marker indices are drawn first, then individual indices.
# NOTE(review): no random seed is set, so the drawn coordinates change on every run.
xnan = np.random.randint(0, genotype.shape[1], size=nan_n)
ynan = np.random.randint(0, genotype.shape[0], size=nan_n)

# One row per coordinate: [individual index, marker index].
nan_coords = np.column_stack((ynan, xnan))

print(nan_coords[:4])


[[   64 21125]
 [   91 20228]
 [   92  6786]
 [   30 37432]]


In [340]:
def inv_reproduce(nan_wind,
                    nan_acc, 
                    featl,
                    local_l,
                    other_obs,
                    ncomps= 5,
                    metric= "euclidean",
                    ploidy= 2,
                    P= 15):
    '''
    Inverse transform one observation through a PCA fitted on `nan_wind`,
    then project the reconstruction back into that PCA's feature space.

    Parameters:
    - nan_wind: genotype window (observations x markers) used to fit the PCA.
    - nan_acc: row index of the target observation in `featl` and `local_l`.
    - featl: feature-space coordinates, one row per observation.
        NOTE(review): these are inverse transformed with the PCA fitted here,
        so they presumably need to come from a comparable fit -- confirm.
    - local_l: genotype window holding the observed markers of every observation.
    - other_obs: row indices of `local_l` used as the control set.
    - ncomps: number of principal components of the PCA.
    - metric: kept for backward compatibility; not used.
    - ploidy: factor applied to the integer-cast reconstruction.
    - P: kept for backward compatibility; not used.

    Returns:
    - hap_pack: array of two rows, observed then reconstructed markers.
    - feat_pack: array of two rows, original then re-projected coordinates.
    - control_dists: Manhattan distances from the reconstruction to every
        row of local_l[other_obs].
    '''
    # Imported here so the function does not depend on a later cell having
    # been run first.
    from sklearn.metrics import pairwise_distances

    pca_miss = PCA(n_components=ncomps, whiten=False,svd_solver='randomized')
    pca_miss.fit(nan_wind)

    # Passed as a single-row matrix, the documented input shape.
    target_feats= np.asarray(featl[nan_acc]).reshape(1,-1)
    inv_hap= pca_miss.inverse_transform(target_feats)
    # NOTE(review): the integer cast truncates toward zero rather than
    # rounding, before scaling by ploidy -- confirm this is the intended
    # genotype encoding.
    inv_hap= np.array(inv_hap, dtype= int)
    inv_hap= inv_hap.reshape(1,-1) * ploidy

    feat_db= pca_miss.transform(inv_hap)

    # The observed markers are read from the arguments rather than from a
    # notebook-level variable.
    hap_pack= np.array([local_l[nan_acc],inv_hap[0]])
    feat_pack= np.array([featl[nan_acc],feat_db[0]])

    control_dists= pairwise_distances(inv_hap,local_l[other_obs],metric= 'manhattan')[0]

    return hap_pack, feat_pack, control_dists



In [348]:
import itertools as it
from impute_tools.impute_tools import get_bg_grid

# Replicates per factor combination.
Nreps = 5

ploidy = 2
# Number of values taken along each swept factor.
Nsteps = 40

# Factors of the sweep: number of sampled individuals, window width in
# markers, and number of principal components.
# NOTE(review): ncomps is a list here but a plain int in earlier cells.
ncomps = [5]
wind_size_list = np.linspace(10, 150, Nsteps, dtype=int)
Nsamps = np.linspace(10, genotype.shape[0] - 1, Nsteps, dtype=int)

faclist = [Nsamps, wind_size_list, ncomps]
# Every (Nsamp, window size, ncomps) combination; the rightmost factor varies fastest.
facCombs = list(it.product(Nsamps, wind_size_list, ncomps))

P = 20
background = get_bg_grid(np.array(facCombs), P=P, dimN=len(faclist))


In [342]:
# Number of PCA reproductions run for each target coordinate:
# one per factor combination and replicate.
len(facCombs) * Nreps

8000

In [352]:
from sklearn.metrics import pairwise_distances
from scipy.stats import norm

# One value is appended to each list per (target coordinate, factor
# combination), with the factor combinations as the inner loop.
hap_dists= []
feat_dists= []
norm_t= []

n_markers= genotype.shape[1]

for nan_idx in range(len(nan_coords)):
    
    nan_obs= nan_coords[nan_idx]
    nan_acc= nan_obs[0]
    nan_pos= nan_obs[1]

    # Every individual except the target.
    candidate_obs= [x for x in range(genotype.shape[0]) if x != nan_acc]
    
    for fac_vec in facCombs:
        
        Nsamp= fac_vec[0]
        wind_sizes= fac_vec[1]
        # Named separately so the module-level ncomps list is not overwritten.
        n_pcs= fac_vec[2]
        
        hap_stack= []
        feat_stack= []
        control_stack= []
        
        # Shift the window so it stays inside the marker range: a negative
        # start would be read as an offset from the end of the array and give
        # an empty or unrelated window.
        half_wind= int(wind_sizes / 2)
        wind_start= min(max(0, nan_pos - half_wind), max(0, n_markers - 2 * half_wind))
        wind_end= wind_start + 2 * half_wind
        local_l= genotype[:,wind_start:wind_end]

        # Observed markers of the target individual in this window.
        hap_check= local_l[nan_acc]

        other_obs= candidate_obs
        if Nsamp > 0 and Nsamp < len(candidate_obs):
            other_obs= np.random.choice(candidate_obs,size= Nsamp,replace= False)

        ###

        nan_wind= genotype[other_obs,wind_start:wind_end]

        # NOTE(review): the sampled individuals are fixed across replicates, so
        # replicates differ only through the randomized SVD -- confirm intent.
        for rep in range(Nreps):
            
            pca2 = PCA(n_components=n_pcs, whiten=False,svd_solver='randomized')
            featl= pca2.fit_transform(local_l)
            hap_pack, feat_pack, control_dists= inv_reproduce(nan_wind,
                                                                nan_acc, 
                                                                featl,
                                                                local_l,
                                                                other_obs,
                                                                ncomps= n_pcs,
                                                                metric= metric,
                                                                ploidy= ploidy)

            hap_stack.append(hap_pack)
            feat_stack.append(feat_pack)
            control_stack.append(control_dists)

        # Per-replicate distance between the observed and reproduced rows.
        hap_rep_dists= [pairwise_distances(X,metric= 'manhattan')[0,1] for X in hap_stack]
        feat_rep_dists= [pairwise_distances(X,metric= metric)[0,1] for X in feat_stack]

        # Position of the observed-vs-reproduced Manhattan distance within a
        # normal fitted to the control distances. Both sides are Manhattan
        # distances in marker space, so they are on the same scale.
        control_cdf= [
            norm.cdf(hap_rep_dists[x],loc= np.mean(control_stack[x]),scale= np.std(control_stack[x]))
            for x in range(len(control_stack))
        ]

        # Average the per-replicate distances, not the raw arrays of the last replicate.
        hap_dists.append(np.mean(hap_rep_dists))
        feat_dists.append(np.mean(feat_rep_dists))
        norm_t.append(np.mean(control_cdf))


In [354]:
# norm_t holds one value per (target coordinate, factor combination), with the
# factor combinations as the inner loop. Average over the target coordinates
# so x and y both hold one value per factor combination. Only complete passes
# over facCombs are used.
n_combs= len(facCombs)
n_passes= len(norm_t) // n_combs
norm_by_comb= np.array(norm_t[:n_passes * n_combs], dtype= float).reshape(n_passes, n_combs).mean(axis= 0)

# Window size of every factor combination (second factor of each tuple).
comb_wind_sizes= [fac_vec[1] for fac_vec in facCombs]

figwl= [go.Scatter(
    x= comb_wind_sizes,
    y= norm_by_comb,
    mode= 'markers',
    name= 'feat_dist / featN'
)]


layout= go.Layout(
    title= 'Reproduction score by window size',
    xaxis= dict(title= 'window size (markers)'),
    yaxis= dict(title= 'mean normal CDF of the reproduction distance')
)

Figure_wl= go.Figure(data= figwl, layout= layout)

iplot(Figure_wl)

In [355]:
# One row per factor combination: (Nsamp, window size, ncomps).
facfeats= np.array(facCombs)

# Set to True to plot the factor combinations on their own principal
# components instead of on the raw factors.
trans_feats= False

if trans_feats:
    n_comp= 3
    pcafeats = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')
    facfeats= pcafeats.fit_transform(facfeats)

# norm_t holds one value per (target coordinate, factor combination), with the
# factor combinations as the inner loop. Average over the target coordinates
# so z_proc has one value per row of facfeats. Only complete passes are used.
n_combs= len(facfeats)
n_passes= len(norm_t) // n_combs
norm_by_comb= np.array(norm_t[:n_passes * n_combs], dtype= float).reshape(n_passes, n_combs).mean(axis= 0)

# Floor the values at the smallest positive float so a CDF of exactly 0 does
# not become an infinite -log value.
z_proc= -np.log(np.clip(norm_by_comb, np.finfo(float).tiny, None))

In [356]:

# Contour of z_proc over the first two columns of facfeats.
contour_trace = go.Contour(
    x=facfeats[:, 0],
    y=facfeats[:, 1],
    z=z_proc,
)
figwl = [contour_trace]

# Square canvas.
layout = go.Layout(height=800, width=800)

Figure_wl = go.Figure(data=figwl, layout=layout)

iplot(Figure_wl)