In [1]:
import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import MeanShift, estimate_bandwidth

import pandas as pd

from scipy import stats
from scipy.stats import beta
from math import sin
from random import randint
from IPython.display import clear_output
import matplotlib.pyplot as plt
import itertools as it

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

import collections

def recursively_default_dict():
    """Return a defaultdict whose missing keys are filled with further nested defaultdicts.

    The function passes itself as the default factory, so nesting of any
    depth is created on first access (e.g. ``d['a']['b']['c'] = 1``).
    """
    nested_factory = recursively_default_dict
    return collections.defaultdict(nested_factory)

from matplotlib.collections import BrokenBarHCollection
import re

from structure_tools.Modules_tools import return_fsts

# Colour names used to colour groups in the PCA plots (passed to plot_global_pca).
# NOTE(review): 'darkorange' and 'darkseagreen' each appear twice, and the
# 'darkgray'/'darkgrey' and 'darkslategray'/'darkslategrey' pairs are spelling
# aliases of the same CSS colour, so distinct groups may share a colour.
PCA_color_ref= ['darkseagreen','crimson', 'darkorange', 'darkblue', 'darkcyan',
            'darkgoldenrod', 'darkgray', 'darkgrey', 'darkgreen',
            'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange',
            'darkorchid', 'darkred', 'darksalmon', 'darkseagreen',
            'darkslateblue', 'darkslategray', 'darkslategrey',
            'darkturquoise', 'darkviolet', 'deeppink']

In [2]:

import allel

def read_vcf_allel(file_vcf,haps_extract= False,calldata= 'calldata/GT'):
    '''
    Use scikit-allel to read a vcf file and organise variant information into a summary pandas df.

    Only bi-allelic sites whose REF and first ALT allele are both a single
    character are kept; sites with a second ALT allele, and sites where REF or
    ALT is longer than one character, are dropped from both outputs.

    Parameters
    ----------
    file_vcf : path of the vcf file, passed unchanged to allel.read_vcf.
    haps_extract : bool
        If True, return the two allele slices of the call array stacked along
        the first axis (slice 1 first, then slice 0), so the result has twice
        as many rows as the call array has samples.
        If False, return per-sample alternate-allele counts (GenotypeArray.to_n_alt).
    calldata : str
        Key of the call array to read from the allel.read_vcf output.

    Returns
    -------
    geno : numpy array, rows are samples (or haplotypes), columns are retained variants.
    summary : pandas.DataFrame with columns CHROM, POS, ID, REF, ALT, QUAL, FILTER,
        one row per retained variant. Values go through a single numpy array
        before the frame is built, so column dtypes are not preserved.
    samples : the 'samples' entry of the allel.read_vcf output (one name per
        sample, also when haps_extract is True).

    If allel.read_vcf returns a falsy value, a message is printed and three
    empty dicts are returned.
    '''
    vcf_ori= allel.read_vcf(file_vcf)

    if not vcf_ori:
        print('file:')
        print(file_vcf)
        print('is empty.')

        return {}, {}, {}

    ### get genotype array
    geno= vcf_ori[calldata]
    n_variants= geno.shape[0]

    ref_alleles= vcf_ori['variants/REF']
    alt_alleles= vcf_ori['variants/ALT']

    ## Filter SNPs: keep the indices of single-character, bi-allelic sites.
    ## A non-empty second ALT entry marks a site with more than one
    ## segregating alternate allele; those sites are skipped.
    single= [
        idx for idx in range(n_variants)
        if not alt_alleles[idx][1]
        and len(ref_alleles[idx]) == 1
        and len(alt_alleles[idx][0]) == 1
    ]

    if haps_extract:
        ## stack allele slice 1 on top of allele slice 0 (rows = 2 x samples).
        geno= np.concatenate((geno[:,:,1].T,geno[:,:,0].T),axis= 0)
    else:
        geno= allel.GenotypeArray(geno).to_n_alt().T

    ## setup summary

    column_names= ['CHROM','POS','ID','REF','ALT','QUAL','FILTER']

    alts= [alt_alleles[x][0] for x in range(n_variants)]
    PASS= [['.','PASS'][int(vcf_ori['variants/FILTER_PASS'][x])] for x in range(n_variants)]

    summary= [
        vcf_ori['variants/CHROM'],
        vcf_ori['variants/POS'],
        vcf_ori['variants/ID'],
        ref_alleles,
        alts,
        vcf_ori['variants/QUAL'],
        PASS,

    ]

    summary= np.array(summary).T

    if len(single) < n_variants:
        ## at least one site was rejected: subset columns / rows to the kept sites.
        geno= geno[:,single]
        summary= summary[single,:]

    summary= pd.DataFrame(summary,columns= column_names)

    return geno, summary, vcf_ori['samples']




## vcf analysis
Jupyter notebook for the local analysis of genetic data stored in .vcf format.

Analyse structure across the whole data set, then study variation across local genomic windows in more detail.

### Input

In [15]:
from structure_tools.vcf_geno_tools import simple_read_vcf

# NOTE(review): this first path is dead - it is overwritten by the next assignment.
vcf_file= 'vcf_data/data_cleanRefs_Alien01_Admx.vcf'
# NOTE(review): absolute, machine-specific path; the notebook will not run elsewhere as is.
vcf_file= 'D:/GitHub/Tools_and_toys/VCF_analysis/Simu_03-03-2019/data.vcf'

# Read the vcf with the project reader. `genotype` is indexed [individual, marker]
# (see the two prints below); `summary` is the per-variant table.
genotype, summary, info_save= simple_read_vcf(vcf_file,row_info= 5,header_info= 9,phased= True)
# Alternative reader defined above (scikit-allel based), currently disabled:
#geno, summary, Names= read_vcf_allel(vcf_file,haps_extract= True) 
print('Number of markers: {}'.format(genotype.shape[1]))
print('Number of individuals: {}'.format(genotype.shape[0]))

Number of markers: 19112
Number of individuals: 200


In [16]:
# Preview the first rows of the variant summary table.
summary.head()


Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT
0,1,644,1,A,T,.,PASS,.,GT:AD:DP
1,1,1527,2,A,T,.,PASS,.,GT:AD:DP
2,1,2238,3,A,T,.,PASS,.,GT:AD:DP
3,1,8063,4,A,T,.,PASS,.,GT:AD:DP
4,1,8704,5,A,T,.,PASS,.,GT:AD:DP


### Global variation

Perform PCA across data set.

Perform mean-shift clustering on the PCA coordinates to extract genetically coherent groups of accessions.

These will later be used for supervised analysis.

In [121]:
from structure_tools.Tutorial_subplots import plot_global_pca

## Perform PCA
# Number of principal components retained for the global analysis.
n_comp= 3
# NOTE(review): the randomized solver is used without a random_state, so the
# projection may differ slightly between runs.
pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')

# Fit on the full genotype matrix and keep the projected coordinates (one row per individual).
feats= pca.fit_transform(genotype)

In [122]:
## perform MeanShift clustering.
# Kernel bandwidth estimated from the PCA coordinates.
bandwidth = estimate_bandwidth(feats, quantile=0.2)

# NOTE(review): per the scikit-learn documentation min_bin_freq only filters
# seeds when bin_seeding=True, so it presumably has no effect here - confirm.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=True, min_bin_freq=45)
ms.fit(feats)
labels1 = ms.labels_
# Map each cluster label to the list of row indices assigned to it.
label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1)))}

# One display name per individual: row index plus assigned cluster label.
idvector= ["i{}_pop{}".format(x,labels1[x]) for x in range(len(labels1))]
###

In [123]:
###
# Plot the global PCA coordinates, coloured by mean-shift cluster.
plot_global_pca(feats,label_select,PCA_color_ref,title= 'global_pca',height= 500,width= 950)


plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



In [840]:

wind_sizes= 20 # local window size, in number of features (markers)

wind_prox= 5e5 # max distance (in POS units) between a training window position and the position to impute



In [841]:

nan_char= [9] # codes to avoid; passed to code_find as code_v and to get_likes_engine as nan_char.
code_keep= [1,2] # codes to keep; passed to bin_keep as code_keep.


In [842]:

### seq_store and dist tools
ncomps= 3 # number of components to retain in the dimensionality reduction of the raw data.

metric= 'euclidean' # distance metric used in feature space
dimN= ncomps # number of dimensions used to calculate differences (<= ncomps).
Nreps= 100 # number of training windows to extract (from within the region defined by 2x wind_prox)
ind_min= 50 # minimum number of windows

#### Observation clustering tools
comps_dists= 3 # number of dimensions kept when reducing distance vectors for clustering.

#### Background Grid tools
P= 25 # grid density
expand=1 # grid scaling, relative to the coordinates of the reduced raw data

#### KDE LIKELIHOOD TOOLS
dist_comps= 3 # number of dimensions kept when reducing distance vectors for likelihood inference.
####


In [988]:
## PREP

from impute_tools.genome_adapt import (
    window_parse, lwind_extract, recover_hap
)

from impute_tools.impute_cofactors import (
    bin_keep, code_find, sg_varSel
)

from impute_tools.impute_tools_I import (
    kde_likes_extract, get_likes_engine, 
    window_exam, get_bg_grid
)

from sklearn.metrics import pairwise_distances


#####
##### Functions and tools packages.
### window position select function and arguments
def window_parse(summary, centre= 0, wind_sizes= 20, wind_prox= 1e6):
    '''
    Select the rows of `summary` that can serve as training positions for a focal marker.

    A row is selected when its POS lies within `wind_prox` of the focal
    marker's POS and the row is more than int(wind_sizes/2) rows away from
    `centre` (the focal row itself is never returned).

    NOTE(review): this definition shadows the `window_parse` imported from
    impute_tools.genome_adapt at the top of this cell.

    Parameters
    ----------
    summary : pandas.DataFrame
        Variant table with a `POS` column whose values convert to int.
        Rows are addressed by position (0 .. n-1), independent of the index labels.
    centre : int
        Row position of the focal marker.
    wind_sizes : int
        Local window size, in rows; half of it is excluded on each side of `centre`.
    wind_prox : number
        Maximum absolute POS difference to the focal marker.

    Returns
    -------
    list of int
        Selected row positions, in increasing order.

    Raises
    ------
    KeyError
        If `centre` is not a valid row position.
    '''
    # Convert the whole column once instead of one pandas lookup per row.
    positions= summary.POS.astype(int).to_numpy()
    n_rows= positions.shape[0]

    if not 0 <= centre < n_rows:
        raise KeyError(centre)

    half_wind= int(wind_sizes / 2)
    rows= np.arange(n_rows)

    close_enough= np.abs(positions - positions[centre]) <= wind_prox
    outside_local= (rows < centre - half_wind) | (rows > centre + half_wind)

    # tolist() returns plain Python ints, as the original list comprehension did.
    return rows[close_enough & outside_local & (rows != centre)].tolist()


# Window-selection function and its keyword arguments; called later as
# wparse_func(summary, centre= ..., **wparse_args).
wparse_func= window_parse
wparse_args= {
    'wind_sizes': wind_sizes,
    'wind_prox': wind_prox
}

### 
### local window processing function and arguments

def new_funkeep(lwind,code_keep= [2], binned= False):
    '''
    Keep only the codes listed in `code_keep`; every other entry becomes 0.

    Parameters
    ----------
    lwind : array-like
        Array of codes (any shape).
    code_keep : sequence
        Codes to retain. A code listed more than once is kept once (entries
        are matched by membership, not accumulated). The default list is
        never mutated.
    binned : bool
        If True, return 1 where a kept (non-zero) code is present and 0 elsewhere.

    Returns
    -------
    numpy.ndarray of int, same shape as `lwind`.
    '''
    lwind= np.asarray(lwind)

    # Membership mask instead of summing one masked copy per code, so that
    # duplicated codes in code_keep cannot inflate the output values.
    kept= np.where(np.isin(lwind, code_keep), lwind, 0).astype(int)

    if binned:
        kept= (kept != 0).astype(int)

    return kept

# Local-window processing function and its keyword arguments.
# NOTE(review): the imported bin_keep is used here; new_funkeep defined above is not referenced.
lproc_func= bin_keep
lproc_args= {
    'code_keep': code_keep,
    'binned': False
}

# [function, kwargs] pair handed to get_likes_engine.
process_tools= [
    lproc_func,
    lproc_args
]

###
### local window - which individuals to keep, return boolean
lkeep_func= code_find
lkeep_args= {
    'code_v': nan_char,
    'binned': True,
    'axis': 1
}

# [function, kwargs] pair handed to get_likes_engine.
keep_tools= [
    lkeep_func,
    lkeep_args
]

###########
#### Feature Var processing tools
varFilt_func= sg_varSel
varFilt_args= {
    'proc': 'none'
}

# NOTE(review): references sg_varSel directly instead of varFilt_func; rebinding
# varFilt_func alone would not change the tool list.
varFilt_tools= [sg_varSel, varFilt_args]


####
#### KDE LIKELIHOOD TOOLS
####
dist_func= kde_likes_extract
dist_args= {
    'dist_comps': dist_comps
}
dist_tools= [dist_func,dist_args]

###
### PCA object for local windows
# NOTE(review): no random_state with the randomized solver - local projections may vary between runs.
dr_obj = PCA(n_components=ncomps, whiten=False,svd_solver='randomized')
std_diffs= True  # forwarded to get_likes_engine and window_exam

In [1001]:
#### select position to exclude
####
# NOTE(review): no random seed is set, so the focal cell changes on every run.
nan_n= 1

# Random marker (column) and random individual (row) of the genotype matrix.
xnan= np.random.randint(0,genotype.shape[1],size= nan_n)[0]
ynan= np.random.randint(0,genotype.shape[0],size= nan_n)[0]

# Focal cell as [individual index, marker index].
tf= [ynan,xnan]
# Previously used focal cells, kept for reference:
#tf= [135, 2661]
#tf= [299, 1194]
#tf= [142, 1335]
###
# Coordinates of every cell coded 1 (presumably heterozygous calls - confirm).
het_char= genotype==1
tfoc= np.where(het_char==1)
tfoc= np.array(tfoc).T

# NOTE(review): select_idx and tfoc are only used by the commented-out line below.
select_idx= np.random.randint(0,tfoc.shape[0],1)[0]
#tf= tfoc[select_idx]
#tf= [15, 1707]
#tf= [378, 1535]
#tf= [232, 899]
###
#tf= [87, 1893]
#tf=[131, 2813]
#tf= [7, 1618]
#[302, 465]
#tf= [  42, 2340]
#tf= [ 287, 1535]
#[346, 1908]
# Individual (accession) index and marker index of the focal cell.
tf_acc= tf[0]
tf_pos= tf[1]

print(tf)
print(idvector[tf[0]])

[70, 11747]
i70_pop2


In [1002]:
# Passed to get_likes_engine as avoid_range: half the window size, minus one.
avoid_range= int(wind_sizes / 2) - 1
#avoid_range= 0
###
###
# Candidate training positions around the focal marker.
wst= wparse_func(summary,centre= tf_pos,**wparse_args)
print('# pos: {}'.format(len(wst)))

###
### 

# Run the project likelihood engine with the tool pairs and settings defined
# in the configuration cells above.
dist_store, labelf_select, correct_dist, select_same, std_gp_use= get_likes_engine(genotype, wst, tf, 
                     process_tools, keep_tools, varFilt_tools, dist_tools,comps_dists= comps_dists,
                     wind_sizes= wind_sizes, Nreps= Nreps, ncomps= ncomps, nan_char= nan_char, ind_min= ind_min,
                     dimN= dimN, metric= metric,std_diffs= std_diffs, avoid_range= avoid_range)

print(dist_store.shape)

# pos: 168
(101, 199)



invalid value encountered in true_divide



(100, 199)


In [1003]:
##
## local window - get and process

# Extract the local window around the focal marker, without masking any position.
local_l= lwind_extract(genotype, idx= tf_pos, wind_sizes= wind_sizes,mask_pos= [])

## samp keep - samples without het or nan at this window
samp_keep= lkeep_func(local_l, **lkeep_args)
# Number of samples retained.
print(sum(samp_keep))

### process local window - convert to haplotypes - keep only code_keep
local_l= lproc_func(local_l,**lproc_args)

##
## local PCA and PCA transform
##
# Fit on the retained samples only, then project every sample.
pca_special= dr_obj.fit(local_l[samp_keep])
featl= pca_special.transform(local_l)

##

200


In [1004]:
# NOTE(review): this cell repeats the last two statements of the previous cell
# (refit of dr_obj and re-projection); it looks redundant.
pca_special= dr_obj.fit(local_l[samp_keep])
featl= pca_special.transform(local_l)


In [1005]:
# Examine the local window with the project helper, reusing the outputs of
# get_likes_engine and the grid settings P / expand.
background, like_diet= window_exam(featl, samp_keep, select_same, std_gp_use, dist_store, dist_tools, 
                labelf_select= labelf_select,correct_dist= correct_dist,
               P= P, dimN= ncomps, metric= metric,expand= expand, std_diffs= std_diffs)

4


In [1006]:

# Recover the focal window from the background grid and its likelihoods,
# using the fitted local PCA.
tf_rec= recover_hap(background,like_diet,pca_special,
                scale= 1, round_t= True)

# Project the recovered window back into the local PCA space (single row).
tf_proj= pca_special.transform(tf_rec.reshape(1,-1))[0]


In [1007]:
from impute_tools.impute_plots import (
    plot_extracted, plot_compare
)

# Traces of the local PCA projection; plot_out= False so nothing is drawn here.
figwl= plot_extracted(featl, label_select, tf_acc, labels=idvector, plot_out= False)

# Trace built from the background grid, its likelihoods and the projected recovered window.
trace= plot_compare(figwl, background, like_diet, tf_proj)


plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



In [1008]:
# Combine the comparison trace with the local PCA traces in a single figure.
fig_dual= [trace]

fig_dual.extend(figwl)

layout= go.Layout(
    height= 800,
    width= 900
)
Figure= go.Figure(data= fig_dual,layout= layout)
iplot(Figure)

In [1009]:
# Display the window returned by recover_hap.
tf_rec

array([[1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0]])

In [1010]:
# Processed local-window row of the focal accession, for comparison with tf_rec.
local_l[tf_acc]

array([1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0])

In [1011]:
# Re-extract the unprocessed local window and show the focal accession's row.
ori_wd= lwind_extract(genotype, idx= tf_pos, wind_sizes= wind_sizes,mask_pos= [])
ori_wd[tf_acc]

array([1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0])

In [1012]:
# Value stored in the genotype matrix at the focal cell.
print('missing call: {}'.format(genotype[tf_acc,tf_pos]))
# NOTE(review): assumes the focal marker sits at index int(wind_sizes / 2) of the
# window returned by lwind_extract - confirm against that helper.
print('inferred call: {}'.format(tf_rec[0][int(wind_sizes / 2)]))

missing call: 0
inferred call: 0
