In [1]:
import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA

import pandas as pd

from IPython.display import clear_output
import matplotlib.pyplot as plt
import itertools as it

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)


PCA_color_ref= ['darkseagreen','crimson', 'darkorange', 'darkblue', 'darkcyan',
            'darkgoldenrod', 'darkgray', 'darkgrey', 'darkgreen',
            'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange',
            'darkorchid', 'darkred', 'darksalmon', 'darkseagreen',
            'darkslateblue', 'darkslategray', 'darkslategrey',
            'darkturquoise', 'darkviolet', 'deeppink']

## vcf analysis
Jupyter notebook for the local analysis of genetic data stored in .vcf format.

Perform analysis of structure across data set, followed by a more detailed study of variation across local genomic windows.

### Input

In [2]:
from structure_tools.vcf_geno_tools import simple_read_vcf
from structure_tools.vcf_geno_tools import read_geno_nanum

vcf_file= 'D:/GitHub/Tools_and_toys/VCF_analysis/Extract/vcf/Extract_Chr8_15000.vcf'
#vcf_file= 'D:/GitHub/Tools_and_toys/VCF_analysis/Simu_17-03-2019/data.vcf'

row_info= 6
header_info= 9
phased= False

genotype, summary, Names= read_geno_nanum(vcf_file, row_info= row_info, header_info= header_info,phased= phased)

print('Number of markers: {}'.format(genotype.shape[1]))
print('Number of individuals: {}'.format(genotype.shape[0]))

control_subset= 1
clean= 1
subbed= 1

{'fileformat': 'VCFv4.2', 'fileDate': '20190327', 'source': 'PLINKv1.90', 'contig': '<ID8,length28422468>', 'INFO': '<IDPR,Number0,TypeFlag,Description"Provisional reference allele, may not be based on real reference genome">', 'FORMAT': '<IDGT,Number1,TypeString,Description"Genotype">'}
Number of markers: 15000
Number of individuals: 3023


In [3]:
## read passport information

Input_file= 'D:/Rice/Project_external/metadata/orderCore_INFO.txt'

RG_info= pd.read_csv(Input_file,sep= '\t')

RG_info.head()

Unnamed: 0.1,Unnamed: 0,ID,NAME,COUNTRY,REGION,sNMF_K3,Jap_K4,K9_cluster,Initial_subpop,genoIndex,code,label
0,0,CX59,"MILAGROSA,_ZAWA_BANDAY",Philippines,As5,4,1,cB_(Bas),aro,296,4,aro
1,1,CX65,DOMSIAH,Iran,As1,4,1,cB_(Bas),aro,301,4,aro
2,2,CX67,BINAM,Iran,As1,4,1,cB_(Bas),aro,303,4,aro
3,3,CX104,SADRI_RICE_1,Iran,As1,4,1,cB_(Bas),aro,338,4,aro
4,4,CX143,KHASAR,Iran,As1,4,1,cB_(Bas),aro,372,4,aro


In [4]:
summary.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT
0,8,14473,242044001,C,T,.,.,PR,GT
1,8,15216,242044744,T,C,.,.,PR,GT
2,8,18535,242048063,G,A,.,.,PR,GT
3,8,19970,242049498,C,T,.,.,PR,GT
4,8,26680,242056208,C,A,.,.,PR,GT


In [5]:
## Process Names vcf names.
## Instance specific processing due to ID copy in VCF file.
Names_vcf= list(Names)

for x in range(len(Names_vcf)):
    ind= Names_vcf[x]
    newid= ind.split('_')
    
    if len(newid) > 2:
        newid= '_'.join(newid[:2])
    else:
        newid= newid[0]
    
    Names_vcf[x]= newid


In [6]:
### subset core
core_subset= True

coreID_file= 'D:/Rice/Project_external/metadata/Order_core.txt'


with open(coreID_file,'r') as fp:
    coreIDs= fp.readlines()

coreIDs= [x.strip() for x in coreIDs]

core_idx= [x for x in coreIDs if x in Names_vcf]
core_idx= [Names_vcf.index(x) for x in core_idx]

ID_pop= {
    RG_info['ID'][x]: RG_info['Initial_subpop'][x] for x in range(RG_info.shape[0])
}

core_names= [Names_vcf[x] for x in core_idx]
core_pop= [ID_pop[x] for x in core_names]


In [7]:

if control_subset:
    genotype= genotype[core_idx,:]
    control_subset= 0
    

In [8]:
from impute_tools.genome_adapt import (
    clean_geno
)

sub_sel= ['subtrop','trop','temp','aro']
#sub_sel= ['ind1A','temp','aro','aus']

if sub_sel and subbed: 
    sub_idx= [x for x in range(len(core_pop)) if core_pop[x] in sub_sel]
    genotype= genotype[sub_idx]
    core_pop= [core_pop[x] for x in sub_idx]
    core_names= [core_names[x] for x in sub_idx]
    subbed= 0
    ##


if clean:
    keep_pos= clean_geno(genotype, nan_char= 9,
               het_char= 1)
    
    genotype= genotype[:,keep_pos]
    summary= summary.iloc[keep_pos]
    summary= summary.reset_index()
    clean= 0
    



In [9]:
summary.head()

Unnamed: 0,index,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT
0,0,8,14473,242044001,C,T,.,.,PR,GT
1,10,8,34405,242063933,C,T,.,.,PR,GT
2,16,8,37535,242067063,A,C,.,.,PR,GT
3,20,8,46573,242076101,G,A,.,.,PR,GT
4,22,8,61005,242090533,T,G,.,.,PR,GT


### Global variation

Perform PCA across data set.

Perform Mean shift clustering to attempt to extract genetically coherent groups of accessions.

These will later be used for supervised analysis.

In [10]:
## Perform PCA
n_comp= 4
pca_global = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')

feats= pca_global.fit_transform(genotype)

In [11]:
from sklearn.cluster import MeanShift, estimate_bandwidth

from structure_tools.Tutorial_subplots import plot_global_pca
## perform MeanShift clustering.
bandwidth = estimate_bandwidth(feats, quantile=0.2)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=True, min_bin_freq=15)
ms.fit(feats)
labels1 = ms.labels_
label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1)))}

idvector= ['i{}_pop{}_{}'.format(x,labels1[x],core_pop[x]) for x in range(len(labels1))]

###
label_pops= {z: [core_pop[x] for x in g] for z,g in label_select.items()}
label_connect= {z: g[np.random.randint(0,len(g))] for z,g in label_pops.items()}
colordict= {z: PCA_color_ref[z] for z in label_pops.keys()}


In [12]:
###
plot_global_pca(feats,label_select,colordict,labels= core_pop,title= 'global_pca',height= 500,width= 950)


plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



In [13]:
genotype.shape

(385, 3706)

#########################################    ############################################################################

# Heterozygotes

## part I. Connecting windows

In [14]:
from impute_tools.genome_adapt import (
    window_parse, lwind_extract, recover_hap
)

from impute_tools.impute_cofactors import (
    bin_keep, code_find, sg_varSel
)

wind_sizes= 50
nan_char= [1,9]
code_keep= 2

#
ncomps= 3
##
##
lproc_func= bin_keep
lproc_args= {
    'keep': code_keep
}

##
##
lkeep_func= code_find
lkeep_args= {
    'code_v': nan_char,
    'binned': True,
    'axis': 1
}

dr_obj = PCA(n_components=2, whiten=False,svd_solver='randomized')


In [15]:
#### select position to exclude
####
nan_n= 1

xnan= np.random.randint(0,genotype.shape[1],size= nan_n)[0]
ynan= np.random.randint(0,genotype.shape[0],size= nan_n)[0]

tf= [ynan,xnan]
tf= [135, 2661]
#tf= [299, 1194]
tf_acc= tf[0]
tf_pos= tf[1]

tf

[135, 2661]

In [16]:
##
## local window - get and process

nwind= lwind_extract(genotype, idx= tf_pos, wind_sizes= wind_sizes,mask_pos= [])

## samp keep - samples without het or nan at this window
samp_keep= lkeep_func(nwind, **lkeep_args)
print(sum(samp_keep))

### process local window - convert to haplotypes - keep only code_keep
local_l= lproc_func(nwind,**lproc_args)

##
## local PCA and PCA transform
##
pca_special= dr_obj.fit(nwind[samp_keep])
featl= pca_special.transform(local_l)

##

350


In [17]:
nwind[2]

array([0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

In [18]:
nwind[13]

array([0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

In [19]:
t= [x for x in range(nwind.shape[0]) if nwind[x,21] == 2]
nwind[t[0]]

array([2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 0, 2, 2, 2, 2, 2,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

In [29]:
from sklearn.cluster import MeanShift, estimate_bandwidth

from structure_tools.Tutorial_subplots import plot_global_pca
## perform MeanShift clustering.
bandwidth = estimate_bandwidth(featl, quantile=0.2)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=True, min_bin_freq=15)
ms.fit(featl)
labels1 = ms.labels_
label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1)))}

gp_keys= sorted(label_select.keys())


In [23]:
freq_dict= {z: np.sum(local_l[g],axis= 0) / len(g) for z,g in label_select.items()}
freq_array= [freq_dict[z] for z in gp_keys]
freq_array= np.array(freq_array)

mu= 1e-8

freq_array[freq_array == 0]= mu

In [24]:
## likelihood of a single hapt



In [28]:
from scipy.stats import binom

def hap_lik(hapl,freqs_array,n= 1, cov= True):
    
    zt= binom.pmf(hapl, n, freq_array)
    zt_prod= np.prod(zt, axis= 1)
    
    if np.sum(zt_prod) and cov:
        zt_prod= zt_prod / np.sum(zt_prod)

    return zt_prod


In [26]:
## Test with a monomorph

hap1= label_select[1][0]
hap1= nwind[hap1]
print(hap1)

zt_prod= hap_lik(hap1, freq_array,n= 2)
zt_prod

[2 0 0 0 2 0 0 0 0 0 2 0 2 2 0 0 0 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]


array([1.56162717e-112, 1.00000000e+000, 1.45475427e-159, 0.00000000e+000,
       4.95039184e-053, 0.00000000e+000])

In [27]:
## test with an admix

In [28]:
hap1= nwind[13]


zt_prod= hap_lik(hap1, freq_array,n= 2)
zt_prod

array([0., 0., 1., 0., 0., 0.])

In [29]:
nwind[13]

array([0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

In [30]:
## text with all admix:

hapmix= nwind[~samp_keep]
zt_prod=[hap_lik(x, freq_array,n= 2,cov= False) for x in hapmix]
zt_prod= np.array(zt_prod)
zt_prod

array([[0.00000000e+000, 0.00000000e+000, 8.35804094e-022,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 1.93403164e-009,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 1.96675728e-010,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 1.48701650e-016,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 1.16776212e-002,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 2.85372651e-025,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 1.16776212e-002,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 1.16776212e-002,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 1.4870

In [31]:

filt= np.max(zt_prod,axis= 1)
filt

array([8.35804094e-22, 1.93403164e-09, 1.96675728e-10, 1.48701650e-16,
       1.16776212e-02, 2.85372651e-25, 1.16776212e-02, 1.16776212e-02,
       1.48701650e-16, 7.05114719e-04, 7.94013881e-05, 4.57179149e-04,
       6.92763333e-12, 3.00467459e-37, 3.39466861e-23, 3.79378105e-39,
       7.86811996e-60, 2.85372651e-25, 4.11069116e-57, 1.89674771e-02,
       1.66121610e-05, 7.51112100e-09, 1.26060773e-16, 2.64463161e-17,
       2.55233087e-01, 4.55883476e-01, 6.51262108e-03, 1.50222421e-16,
       4.43368560e-58, 1.93403164e-09, 1.75946088e-05, 3.71837669e-03,
       7.47375216e-03, 3.71837669e-03, 3.71837669e-03])

In [32]:
zt= [x for x in range(nwind.shape[0]) if not samp_keep[x]]
np.argmax(zt_prod,axis= 1)

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 4, 0, 2, 2, 2, 0, 1, 0,
       0, 0, 1, 3, 3, 0, 2, 2, 0, 0, 0, 0, 0], dtype=int32)

In [33]:
nwind[zt[0]]

array([0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       2, 0, 2, 0, 0, 0])

In [34]:
## text with all admix:
ref_likes={
    z: [hap_lik(nwind[x], freq_array,n= 2,cov= False) for x in label_select[z]] for z in label_select.keys()
}
ref_likes= {z: np.array(g) for z,g in ref_likes.items()}

ref_likes[0]

array([[1.51735985e-04, 2.36885746e-40, 6.01873947e-36, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [1.51735985e-04, 2.36885746e-40, 6.01873947e-36, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [3.75556046e-01, 2.36885742e-24, 3.46381677e-36, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       ...,
       [3.75556046e-01, 2.36885742e-24, 3.46381677e-36, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [3.75556046e-01, 2.36885742e-24, 3.46381677e-36, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [3.75556046e-01, 2.36885742e-24, 3.46381677e-36, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00]])

In [36]:
ref_likes[2]

array([[0.00000000e+00, 0.00000000e+00, 9.83378628e-03, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 9.83378628e-03, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 9.83378628e-03, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 9.83378628e-03, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 9.83378628e-03, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 9.83378628e-03, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 5.65939662e-03, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 9.83378628e-03, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 8.35804094e-22, 0.00000000e+00,
        0.00000000e+00, 

In [None]:
nwind[label_select[6],:]

In [None]:
label_select[6]

In [None]:
hap_lik(nwind[296], freq_array,n= 2,cov= False)

In [None]:
t= binom.pmf(nwind[296],2,freq_array)
np.prod(t,axis= 1)
#np.prod(t)

In [None]:
hapmix

In [20]:
het_char= genotype==1
tfoc= np.where(het_char==1)
tfoc= np.array(tfoc).T

select_idx= np.random.randint(0,tfoc.shape[0],1)[0]
tf= tfoc[select_idx]

tfoc

array([[   0,   98],
       [   0,  420],
       [   0,  539],
       ...,
       [ 383, 3110],
       [ 383, 3436],
       [ 384,  951]], dtype=int32)

In [30]:


def get_hetWind(genotype, tf,wmax= 20, hom= 2):
    tf_pos= tf[1]
    tf_acc= tf[0]
    print(genotype[tf_acc,tf_pos])
    dl= int(tf_pos) - 1
    du= int(tf_pos) + 1
    
    pos= [tf_pos]
    for idx in range(wmax):
        if genotype[tf_acc,dl] != hom:
            pos.append(dl)
            dl -= 1
        if genotype[tf_acc,du] != hom:
            pos.append(du)
            du += 1
    
    return sorted(pos)

tf_acc= tf[0]
tf_pos= tf[1]

wind_idx= get_hetWind(genotype, tf,wmax= 10, hom= 2)
nwind= genotype[:,wind_idx]
nwind.shape

1


(385, 21)

In [31]:
nwind[tf_acc]

array([0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0])

In [61]:

## samp keep - samples without het or nan at this window
samp_keep= lkeep_func(nwind, **lkeep_args)
print(sum(samp_keep))

### process local window - convert to haplotypes - keep only code_keep
local_l= lproc_func(nwind,**lproc_args)

##
## local PCA and PCA transform
##
pca_special= dr_obj.fit(local_l[samp_keep])
featl= pca_special.transform(local_l)

##

363


In [62]:
fig= [go.Scatter(
    x= featl[g,0],
    y= featl[g,1],
    text= [''.join(np.array(nwind[x],dtype= str)) for x in g],
    mode= 'markers',
    name= str(i)
) for i,g in label_select.items()]

figm= [go.Scatter(
    x= [np.median(featl,axis= 0)[0]],
    y= [np.median(featl,axis= 0)[1]],
    mode= 'markers',
    name= 'centre'
)]

#fig.extend(figm)

layout= go.Layout()

Figure= go.Figure(data= fig,layout= layout)
iplot(Figure)


In [33]:
from sklearn.cluster import MeanShift, estimate_bandwidth

from structure_tools.Tutorial_subplots import plot_global_pca
## perform MeanShift clustering.
bandwidth = estimate_bandwidth(featl, quantile=0.2)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=True, min_bin_freq=15)
ms.fit(featl)
labels1 = ms.labels_
label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1)))}

gp_keys= sorted(label_select.keys())


In [34]:
freq_dict= {z: np.sum(local_l[g],axis= 0) / len(g) for z,g in label_select.items()}
freq_array= [freq_dict[z] for z in gp_keys]
freq_array= np.array(freq_array)

mu= 1e-8

freq_array[freq_array == 0]= mu
freq_array[freq_array == 1]= 1-mu
#freq_array

In [40]:
label_select.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])

In [None]:
t= binom.pmf(sep_hap[1],1,freq_array)
t.T

In [38]:
hap1= nwind[tf_acc]


zt_prod= hap_lik(hap1, freq_array,n= 2,cov= False)
zt_prod

array([1.41132169e-047, 1.57322186e-095, 6.29048915e-035, 1.19228552e-024,
       8.29439822e-070, 1.23456755e-050, 1.59999944e-047, 1.59999944e-047,
       1.59999946e-047, 1.59999958e-095, 1.59999954e-079, 1.59999949e-063,
       1.59999968e-127, 1.59999941e-031])

In [39]:
def hap_unsorted(hap1,ploidy= 2):
    tar= np.zeros((ploidy, len(hap1)))
    for idx in range(len(hap1)):
        ct= hap1[idx]
        tar[:ct,idx]= 1
    
    return tar

sep_hap= hap_unsorted(hap1,ploidy= 2)
sep_hap

array([[0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.]])

In [None]:
nwind[label_select[0][3]]

In [None]:
genotype[tf_acc,tf_pos-10:tf_pos+10]

In [None]:
genotype[328,tf_pos-10:tf_pos+10]

In [None]:
np.sum(nwind,axis= 1)