In [1]:
import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA

import pandas as pd

from IPython.display import clear_output
import matplotlib.pyplot as plt
import itertools as it

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
# Initialise plotly's offline notebook mode; needed before the iplot() calls further down.
init_notebook_mode(connected=True)


# Colour palette indexed by cluster label (cluster z is drawn with PCA_color_ref[z]).
# Every entry is a distinct CSS colour name: repeated names and grey/gray
# aliases would make different clusters indistinguishable in the plots.
PCA_color_ref= ['darkseagreen','crimson', 'darkorange', 'darkblue', 'darkcyan',
            'darkgoldenrod', 'darkgray', 'dodgerblue', 'darkgreen',
            'darkkhaki', 'darkmagenta', 'darkolivegreen', 'firebrick',
            'darkorchid', 'darkred', 'darksalmon', 'forestgreen',
            'darkslateblue', 'darkslategray', 'indigo',
            'darkturquoise', 'darkviolet', 'deeppink']

## vcf analysis
Jupyter notebook for the local analysis of genetic data stored in .vcf format.

Perform analysis of structure across data set, followed by a more detailed study of variation across local genomic windows.

### Input

In [2]:
from structure_tools.vcf_geno_tools import simple_read_vcf
from structure_tools.vcf_geno_tools import read_geno_nanum

# Path of the VCF file to analyse (absolute, machine-specific path).
vcf_file= 'D:/GitHub/Tools_and_toys/VCF_analysis/Extract/vcf/Extract_Chr8_15000.vcf'
#vcf_file= 'D:/GitHub/Tools_and_toys/VCF_analysis/Simu_17-03-2019/data.vcf'

# Layout arguments forwarded to read_geno_nanum.
# NOTE(review): presumably row_info is the number of '##' meta lines and
# header_info the number of fixed columns preceding the sample columns —
# confirm against structure_tools.vcf_geno_tools.
row_info= 6
header_info= 9
phased= False

# genotype is indexed [individual, marker] (see the shape prints below);
# summary is the marker table and Names the sample names.
genotype, summary, Names= read_geno_nanum(vcf_file, row_info= row_info, header_info= header_info,phased= phased)

print('Number of markers: {}'.format(genotype.shape[1]))
print('Number of individuals: {}'.format(genotype.shape[0]))

# One-shot flags consumed by later cells: each guarded step runs once and then
# resets its flag to 0, so re-running that cell does not repeat the step.
control_subset= 1
clean= 1
subbed= 1

{'fileformat': 'VCFv4.2', 'fileDate': '20190327', 'source': 'PLINKv1.90', 'contig': '<ID8,length28422468>', 'INFO': '<IDPR,Number0,TypeFlag,Description"Provisional reference allele, may not be based on real reference genome">', 'FORMAT': '<IDGT,Number1,TypeString,Description"Genotype">'}
Number of markers: 15000
Number of individuals: 3023


In [3]:
## read passport information

# Tab-separated passport table (absolute, machine-specific path). Its `ID` and
# `Initial_subpop` columns are used further down to map accessions to
# sub-populations.
Input_file= 'D:/Rice/Project_external/metadata/orderCore_INFO.txt'

RG_info= pd.read_csv(Input_file,sep= '\t')

# Preview the first rows.
RG_info.head()

Unnamed: 0.1,Unnamed: 0,ID,NAME,COUNTRY,REGION,sNMF_K3,Jap_K4,K9_cluster,Initial_subpop,genoIndex,code,label
0,0,CX59,"MILAGROSA,_ZAWA_BANDAY",Philippines,As5,4,1,cB_(Bas),aro,296,4,aro
1,1,CX65,DOMSIAH,Iran,As1,4,1,cB_(Bas),aro,301,4,aro
2,2,CX67,BINAM,Iran,As1,4,1,cB_(Bas),aro,303,4,aro
3,3,CX104,SADRI_RICE_1,Iran,As1,4,1,cB_(Bas),aro,338,4,aro
4,4,CX143,KHASAR,Iran,As1,4,1,cB_(Bas),aro,372,4,aro


In [4]:
# Preview the marker table returned by read_geno_nanum.
summary.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT
0,8,14473,242044001,C,T,.,.,PR,GT
1,8,15216,242044744,T,C,.,.,PR,GT
2,8,18535,242048063,G,A,.,.,PR,GT
3,8,19970,242049498,C,T,.,.,PR,GT
4,8,26680,242056208,C,A,.,.,PR,GT


In [5]:
## Normalise the sample names read from the VCF.
## Instance specific processing due to ID copy in VCF file:
## names with more than two '_'-separated fields keep their first two fields
## (re-joined with '_'); every other name keeps only its first field.
Names_vcf= [
    '_'.join(fields[:2]) if len(fields) > 2 else fields[0]
    for fields in (name.split('_') for name in Names)
]


In [6]:
### subset core
core_subset= True

coreID_file= 'D:/Rice/Project_external/metadata/Order_core.txt'


# One accession ID per line; surrounding whitespace is stripped.
with open(coreID_file,'r') as fp:
    coreIDs= [line.strip() for line in fp]

# Position of the first occurrence of every VCF sample name. A dictionary
# lookup replaces the repeated list scans (`in` / `.index`) while returning
# the same indices.
name_to_idx= {}
for idx, name in enumerate(Names_vcf):
    name_to_idx.setdefault(name, idx)

# Row indices of the core accessions present in the VCF, in core-file order.
core_idx= [name_to_idx[x] for x in coreIDs if x in name_to_idx]

# Accession ID -> initial sub-population, paired row by row so that the
# mapping does not depend on the index labels of RG_info.
ID_pop= dict(zip(RG_info['ID'], RG_info['Initial_subpop']))

core_names= [Names_vcf[x] for x in core_idx]

# Fail with an explicit message when the passport table lacks a core accession.
missing_ids= [x for x in core_names if x not in ID_pop]
if missing_ids:
    raise KeyError('core accessions missing from passport table: {}'.format(missing_ids))

core_pop= [ID_pop[x] for x in core_names]


In [7]:

# Keep only the rows of the core accessions (core_idx). The flag is reset so
# that re-running this cell does not index the already-subset matrix again.
if control_subset:
    genotype= genotype[core_idx,:]
    control_subset= 0
    

In [8]:
from impute_tools.genome_adapt import (
    clean_geno
)

# Sub-populations retained for the analysis (alternative selection kept below).
sub_sel= ['subtrop','trop','temp','aro']
#sub_sel= ['ind1A','temp','aro','aus']

# One-shot step: restrict genotype, core_pop and core_names to the selected
# sub-populations, then clear `subbed` so a re-run does not subset twice.
if sub_sel and subbed:
    sub_idx= [i for i, pop in enumerate(core_pop) if pop in sub_sel]
    genotype= genotype[sub_idx]
    core_pop= [core_pop[i] for i in sub_idx]
    core_names= [core_names[i] for i in sub_idx]
    subbed= 0


# One-shot step: keep only the marker positions returned by clean_geno, in
# both the genotype matrix and the marker table, then clear `clean`.
if clean:
    keep_pos= clean_geno(genotype, nan_char= 9, het_char= 1)

    genotype= genotype[:,keep_pos]
    # reset_index() keeps the previous row labels in a new 'index' column.
    summary= summary.iloc[keep_pos].reset_index()
    clean= 0
    



In [9]:
# Marker table after cleaning; the 'index' column added by reset_index()
# holds each marker's row label from before the filtering.
summary.head()

Unnamed: 0,index,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT
0,0,8,14473,242044001,C,T,.,.,PR,GT
1,10,8,34405,242063933,C,T,.,.,PR,GT
2,16,8,37535,242067063,A,C,.,.,PR,GT
3,20,8,46573,242076101,G,A,.,.,PR,GT
4,22,8,61005,242090533,T,G,.,.,PR,GT


### Global variation

Perform PCA across data set.

Perform Mean shift clustering to attempt to extract genetically coherent groups of accessions.

These will later be used for supervised analysis.

In [10]:
## Perform PCA
n_comp= 4
# random_state pins the randomized SVD solver so that the embedding, and the
# clusters computed from it, are reproducible between runs.
pca_global = PCA(n_components=n_comp, whiten=False,svd_solver='randomized', random_state=0)

# Coordinates of every accession on the first n_comp principal components.
feats= pca_global.fit_transform(genotype)

In [11]:
from sklearn.cluster import MeanShift, estimate_bandwidth

from structure_tools.Tutorial_subplots import plot_global_pca
## perform MeanShift clustering on the global PCA coordinates.
bandwidth = estimate_bandwidth(feats, quantile=0.2)

# NOTE(review): scikit-learn documents min_bin_freq as used only when
# bin_seeding=True — confirm whether it is meant to have an effect here.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=True, min_bin_freq=15)
ms.fit(feats)
labels1 = ms.labels_
# cluster label -> indices of the accessions assigned to that cluster
label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1)))}

# one identifier per accession: position, cluster label and sub-population
idvector= ['i{}_pop{}_{}'.format(x,labels1[x],core_pop[x]) for x in range(len(labels1))]

###
# cluster label -> sub-population of each member
label_pops= {z: [core_pop[x] for x in g] for z,g in label_select.items()}
# cluster label -> sub-population of one randomly drawn member
label_connect= {z: g[np.random.randint(0,len(g))] for z,g in label_pops.items()}
# cluster label -> colour; the index wraps around the palette so that having
# more clusters than colours does not raise an IndexError.
colordict= {z: PCA_color_ref[z % len(PCA_color_ref)] for z in label_pops.keys()}


In [12]:
###
# Global PCA plot: receives the coordinates (feats), the cluster membership
# (label_select), the cluster colours (colordict) and the sub-population of
# each accession (core_pop).
plot_global_pca(feats,label_select,colordict,labels= core_pop,title= 'global_pca',height= 500,width= 950)


plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



In [13]:
# (number of accessions, number of markers) left after subsetting and cleaning.
genotype.shape

(385, 3706)

#########################################    ############################################################################

# Imputation 

## I. Parameters and prep.

>i. Biologically significant parameters

In [34]:

# Window size; passed to window_parse, lwind_extract and get_likes_engine below.
wind_sizes= 20 # sizes in number of features

# NOTE(review): presumably expressed in the units of the POS column — confirm in window_parse.
wind_prox= 1e6 # window max proximity to position to impute



>ii. Data specific parameters

In [35]:

# nan_char is handed to code_find (as 'code_v') and to get_likes_engine;
# code_keep is handed to bin_keep (as 'keep').
nan_char= [9] # codes to avoid. 
code_keep= [2] # codes to keep. 


>iii. Measurement parameters

In [36]:

### seq_store and dist tools
ncomps= 3 # ncomps to retain in raw data dimensionality reduction.

metric= 'euclidean' # distance calculation in feature space
dimN= ncomps # number of dimensions with which to calculate differences (<= ncomps).
Nreps= 150 # number of training windows to extract (from within window defined by 2x wind_prox)
# NOTE(review): the name suggests a minimum number of individuals rather than windows — confirm in get_likes_engine.
ind_min= 50 # minimum number of windows

#### Observation clustering tools
comps_dists= 5 # N dimensions for Dr of distance vectors for clustering.

#### Background Grid tools
P= 20 # grid density 
expand=1 # grid scaling, relative to coordinates in raw data Dr

#### KDE LIKELIHOOD TOOLS
dist_comps= 10 # N dimensions for Dr of distance vectors for likelihood inference.
####


### Function calls and prep.

The approach is kept as functional as possible.

Window selection, processing, variance filtering and likelihood extraction methods are passed to the main function.

See the respective `tools` scripts for the expected inputs and outputs.

In [37]:
## PREP

from impute_tools.genome_adapt import (
    window_parse, lwind_extract, recover_hap
)

from impute_tools.impute_cofactors import (
    bin_keep, code_find, sg_varSel
)

from impute_tools.impute_tools import (
    kde_likes_extract, get_likes_engine, 
    window_exam, get_bg_grid
)

from sklearn.metrics import pairwise_distances


#####
##### Functions and tools packages.
##### Each step is described by a [function, keyword-arguments] pair so that
##### it can be swapped without touching the calls further down.
### window position select function and arguments
wparse_func= window_parse
wparse_args= {
    'wind_sizes': wind_sizes,
    'wind_prox': wind_prox
}

### 
### local window processing function and arguments
lproc_func= bin_keep
lproc_args= {
    'keep': code_keep
}

process_tools= [
    lproc_func,
    lproc_args
]

###
### local window - which individuals to keep, return boolean
lkeep_func= code_find
lkeep_args= {
    'code_v': nan_char,
    'binned': True,
    'axis': 1
}

keep_tools= [
    lkeep_func,
    lkeep_args
]

###########
#### Feature Var processing tools
varFilt_func= sg_varSel
varFilt_args= {
    'proc': 'none'
}

# Built from varFilt_func, like the other *_tools pairs, so that changing
# the function above is actually picked up here.
varFilt_tools= [varFilt_func, varFilt_args]


####
#### KDE LIKELIHOOD TOOLS
####
dist_func= kde_likes_extract
dist_args= {
    'dist_comps': dist_comps
}
dist_tools= [dist_func,dist_args]

###
### PCA object for local windows
dr_obj = PCA(n_components=ncomps, whiten=False,svd_solver='randomized')


## II. Window selection

### A. Select a random coordinate to impute (observation, feature index).

In [38]:
#### select the coordinate (observation, feature index) to impute
####
nan_n= 1

# Fallback target: a coordinate drawn uniformly at random from the matrix.
xnan= np.random.randint(0,genotype.shape[1],size= nan_n)[0]
ynan= np.random.randint(0,genotype.shape[0],size= nan_n)[0]

tf= np.array([ynan,xnan])
#tf= [135, 2661]
#tf= [299, 1194]
#tf= [142, 1335]
###
# Preferred target: a randomly chosen heterozygous call (code 1).
# tfoc lists the (observation, feature) coordinates of every such call.
het_char= genotype==1
tfoc= np.argwhere(het_char)

# Without this guard np.random.randint(0, 0) raises when the matrix holds no
# heterozygous call; the uniform draw above is used in that case.
if tfoc.shape[0] > 0:
    select_idx= np.random.randint(0,tfoc.shape[0],1)[0]
    tf= tfoc[select_idx]

###
tf_acc= tf[0]
tf_pos= tf[1]

tf

array([ 47, 783], dtype=int32)

### B. Training set extraction and distance calculation.

In [39]:

###
### Candidate window positions returned by wparse_func for the target feature.
wst= wparse_func(summary,centre= tf_pos,**wparse_args)
print('# pos: {}'.format(len(wst)))

###
### Run get_likes_engine on the candidate windows. Of its outputs, this cell
### only inspects dist_store; the others are forwarded to window_exam below.

dist_store, labelf_select, correct_dist, select_same, std_gp_use= get_likes_engine(genotype, wst, tf, 
                     process_tools, keep_tools, varFilt_tools, dist_tools,comps_dists= comps_dists,
                     wind_sizes= wind_sizes, Nreps= Nreps, ncomps= ncomps, nan_char= nan_char, ind_min= ind_min,
                     dimN= dimN, metric= metric)

print(dist_store.shape)

# pos: 254
(151, 384)



invalid value encountered in true_divide



(150, 384)


### C. Local window, extraction and Dr.

In [40]:
##
## local window - get and process

local_l= lwind_extract(genotype, idx= tf_pos, wind_sizes= wind_sizes,mask_pos= [])

## samp keep - samples selected by lkeep_func for this window.
## NOTE(review): lkeep_args only lists nan_char ([9]) as code_v, so samples
## carrying heterozygous calls do not appear to be excluded here — confirm in code_find.
samp_keep= lkeep_func(local_l, **lkeep_args)
print(sum(samp_keep))

### process local window - convert to haplotypes - keep only code_keep
local_l= lproc_func(local_l,**lproc_args)

##
## local PCA and PCA transform: fitted on the kept samples only, then applied
## to every sample. fit() returns the estimator itself, so pca_special and
## dr_obj refer to the same object.
##
pca_special= dr_obj.fit(local_l[samp_keep])
featl= pca_special.transform(local_l)

##

385


In [41]:
# NOTE(review): this cell repeats the last two statements of the previous
# cell (same fit on local_l[samp_keep], same transform) — confirm whether it
# can be dropped.
pca_special= dr_obj.fit(local_l[samp_keep])
featl= pca_special.transform(local_l)


### D. Coordinate likelihood and haplotype inference.

>i. Background grid likelihood by position.

In [42]:
# Build the background grid and its likelihoods from the local PCA coordinates
# and the outputs of get_likes_engine.
# NOTE(review): dimN is passed as ncomps here rather than the dimN parameter;
# both are 3 with the current settings.
background, like_diet= window_exam(featl, samp_keep, select_same, std_gp_use, dist_store, dist_tools, 
                labelf_select= labelf_select,correct_dist= correct_dist,
               P= P, dimN= ncomps, metric= metric,expand= expand)

4


In [43]:
# Shape of the background grid; (8000, 3) is consistent with P**ncomps = 20**3 points in ncomps = 3 dimensions.
background.shape

(8000, 3)

>ii. recover haplotype

The haplotype is recovered using the inverse transform of the local PCA.

In [44]:

# Recover the haplotype from the background grid and its likelihoods, using
# the local PCA object (pca_special).
tf_rec= recover_hap(background,like_diet,pca_special,
                scale= 1, round_t= True)

# Project the recovered haplotype back into the local PCA space;
# reshape(1,-1) presents it as a single-row matrix for transform().
tf_proj= pca_special.transform(tf_rec.reshape(1,-1))[0]


In [45]:
from impute_tools.impute_plots import (
    plot_extracted, plot_compare
)

# Traces for the local PCA coordinates; plot_out= False presumably returns
# the traces instead of drawing them — confirm in impute_plots.
figwl= plot_extracted(featl, label_select, tf_acc, labels=idvector, plot_out= False)

# Trace built from the background grid, its likelihoods and the projected
# prediction; it is combined with figwl in the next cell.
trace= plot_compare(figwl, background, like_diet, tf_proj)


plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



In [46]:
# Single figure holding the comparison trace followed by the local PCA traces.
fig_dual= [trace, *figwl]

layout= go.Layout(height= 800, width= 900)
Figure= go.Figure(data= fig_dual, layout= layout)
iplot(Figure)

## Result

Our predicted haplotype

In [47]:
# Haplotype returned by recover_hap for the target window.
tf_rec

array([[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1]])

The original haplotype, only homozygous calls kept:

In [48]:
# Row of the target accession in the processed local window (output of lproc_func).
local_l[tf_acc]

array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1])

The original genotype, hom and het calls at this window:

In [49]:
# Re-extract the unprocessed window (same call as in section C) to show the
# original genotype codes of the target accession.
ori_wd= lwind_extract(genotype, idx= tf_pos, wind_sizes= wind_sizes,mask_pos= [])
ori_wd[tf_acc]

array([0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 1, 0, 1, 0, 2, 2, 0, 0, 0, 2])

In [50]:
# Original code at the target coordinate versus the value recovered at the
# middle index of the window.
# NOTE(review): assumes the target sits at the centre of the extracted window — confirm in lwind_extract.
print('missing call: {}'.format(genotype[tf_acc,tf_pos]))
print('inferred call: {}'.format(tf_rec[0][wind_sizes // 2]))

missing call: 1
inferred call: 0


## Discussion

TODO: build a benchmark script to estimate the accuracy rate and the variables associated with it.

> Forward

Study applications for phasing data:

**context**

- The current script allows data with given codes to be ignored (e.g. 9 = nan, 1 = het). The transformation is performed using only the desired codes (here 2, 0 = homs), encoded as presence/absence.

- The current example (above) does not remove samples carrying heterozygous calls, although the dimensionality reduction is performed on homozygous calls only.

**observation** 

- Running imputation on heterozygous calls sometimes produces a reasonable prediction of one of the phases, probably the one coherent with the homozygous haplotypes in the vicinity.

- benchmark yet to be done. 


## Continuation

MS-select grid refinement is yet to be incorporated.

Currently this method does not perform the distance profile parse and composite likelihood extraction that the above method does.

The limitation can be seen below.

In [443]:
from impute_tools.impute_tools import (
    nBg_MS, nBg_grid,
    gridWalk
)

# Reference distance profiles: every row of dist_store, restricted to the
# columns in std_gp_use. dist_ref_select is kept for the row-wise alternative
# dist_store[labelf_select[dist_ref_select],:], which used to be assigned here
# and immediately overwritten.
dist_ref_select= 0
dist_ref= dist_store[:,std_gp_use]

# These repeat the values of the parameter cells above so that this section
# can be tuned on its own.
P= 20
dimN= 3
N_samps= P**dimN
dist_comps= 10
Bandwidth_split = 30
kernel= 'gaussian'
metric= 'euclidean'


# Background function and its arguments, handed to gridWalk.
# NOTE(review): BG_args carries its own 'N_samps' (5), distinct from the
# N_samps= P**dimN passed to gridWalk — confirm that this is intended.
BG_func= nBg_MS
BG_args= {
    'lb':0.05,
    'up':0.6,
    'kernel': kernel,
    'N_samps': 5
}


granted, grid_likes= gridWalk(featl,dist_ref,BG_func, BG_args= BG_args, std_gp_use= std_gp_use,
            P= P,
            dimN= dimN,
            N_samps= N_samps,
            dist_comps= dist_comps,
            Bandwidth_split = Bandwidth_split,
            metric= metric,
            kernel= kernel,
            min_samp= 10)



In [444]:
# Shape of the reference distance profiles handed to gridWalk.
dist_ref.shape

(200, 218)

In [445]:
# plotly.tools.make_subplots is deprecated in favour of plotly.subplots.make_subplots.
from plotly.subplots import make_subplots

title= 'coords'
fig_subplots = make_subplots(rows=1, cols=2,subplot_titles=(title, title))

# Left panel: the local PCA traces. The loop variable is deliberately not
# called `trace`, so the comparison trace built earlier is not overwritten.
for fig_trace in figwl:
    fig_subplots.add_trace(fig_trace, row=1, col=1)


# Right panel: first two coordinates of the refined grid, coloured by likelihood.
grid_trace= go.Scatter(
    x= granted[:,0],
    y= granted[:,1],
    #z= grid_likes,
    mode= 'markers',
    marker= {
        'color':grid_likes,
        'colorbar': go.scatter.marker.ColorBar(
            title= 'ColorBar'
        ),
        'colorscale':'Viridis',
        'line': {'width': 0},
        'size': 8,
        'symbol': 'circle',
      "opacity": 1
      }
)

fig_subplots.add_trace(grid_trace, row=1, col=2)

iplot(fig_subplots)


plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead

