In [1]:
import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import MeanShift, estimate_bandwidth

import pandas as pd

from scipy import stats
from scipy.stats import beta
from math import sin
from random import randint
from IPython.display import clear_output
import matplotlib.pyplot as plt
import itertools as it

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

import collections

def recursively_default_dict():
    """Return a defaultdict whose missing keys are filled with further nested defaultdicts."""
    nested= collections.defaultdict(recursively_default_dict)
    return nested

from matplotlib.collections import BrokenBarHCollection
import re

from structure_tools.Modules_tools import return_fsts

# Colour names used for cluster plots; indexed by integer cluster label.
# NOTE(review): 'darkorange' and 'darkseagreen' each appear twice, so two
# different cluster labels can end up with the same colour.
PCA_color_ref= ['darkseagreen','crimson', 'darkorange', 'darkblue', 'darkcyan',
            'darkgoldenrod', 'darkgray', 'darkgrey', 'darkgreen',
            'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange',
            'darkorchid', 'darkred', 'darksalmon', 'darkseagreen',
            'darkslateblue', 'darkslategray', 'darkslategrey',
            'darkturquoise', 'darkviolet', 'deeppink']

## vcf analysis
Jupyter notebook for the local analysis of genetic data stored in .vcf format.

Performs an analysis of genetic structure across the whole data set, followed by a more detailed study of variation across local genomic windows.

### Input

In [2]:
from structure_tools.vcf_geno_tools import simple_read_vcf
from structure_tools.vcf_geno_tools import read_geno_nanum

# Input VCF. This is an absolute local path: adjust it to where the data lives.
vcf_file= 'D:/GitHub/Tools_and_toys/VCF_analysis/Extract/vcf/Extract_Chr8_15000.vcf'
#vcf_file= 'D:/GitHub/Tools_and_toys/VCF_analysis/Simu_17-03-2019/data.vcf'

# Options forwarded to read_geno_nanum.
# NOTE(review): presumably row_info / header_info describe the VCF meta rows and
# the leading non-genotype columns - confirm against structure_tools.
row_info= 6
header_info= 9
phased= False

# genotype: individuals in rows, markers in columns (see the prints below).
genotype, summary, Names= read_geno_nanum(vcf_file, row_info= row_info, header_info= header_info,phased= phased)

print('Number of markers: {}'.format(genotype.shape[1]))
print('Number of individuals: {}'.format(genotype.shape[0]))
# Guard flag read by the core-subsetting cell so that the subset is applied only once.
control_subset= 0

{'fileformat': 'VCFv4.2', 'fileDate': '20190327', 'source': 'PLINKv1.90', 'contig': '<ID8,length28422468>', 'INFO': '<IDPR,Number0,TypeFlag,Description"Provisional reference allele, may not be based on real reference genome">', 'FORMAT': '<IDGT,Number1,TypeString,Description"Genotype">'}
Number of markers: 15000
Number of individuals: 3023


In [3]:
## read passport information

# Tab-separated accession metadata table (absolute local path - adjust as needed).
Input_file= 'D:/Rice/Project_external/metadata/orderCore_INFO.txt'

RG_info= pd.read_csv(Input_file,sep= '\t')

# Preview; the 'ID' and 'Initial_subpop' columns are the ones used further down.
RG_info.head()

Unnamed: 0.1,Unnamed: 0,ID,NAME,COUNTRY,REGION,sNMF_K3,Jap_K4,K9_cluster,Initial_subpop,genoIndex,code,label
0,0,CX59,"MILAGROSA,_ZAWA_BANDAY",Philippines,As5,4,1,cB_(Bas),aro,296,4,aro
1,1,CX65,DOMSIAH,Iran,As1,4,1,cB_(Bas),aro,301,4,aro
2,2,CX67,BINAM,Iran,As1,4,1,cB_(Bas),aro,303,4,aro
3,3,CX104,SADRI_RICE_1,Iran,As1,4,1,cB_(Bas),aro,338,4,aro
4,4,CX143,KHASAR,Iran,As1,4,1,cB_(Bas),aro,372,4,aro


In [4]:
# Distinct sub-population labels in the passport table (set order is arbitrary).
list(set(RG_info['Initial_subpop']))

['ind1B',
 'ind1A',
 'subtrop',
 'aus',
 'temp',
 'admix',
 'trop',
 'ind3',
 'aro',
 'ind2',
 'japx',
 'indx']

In [5]:
# Preview of the per-marker summary table returned by read_geno_nanum.
summary.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT
0,8,14473,242044001,C,T,.,.,PR,GT
1,8,15216,242044744,T,C,.,.,PR,GT
2,8,18535,242048063,G,A,.,.,PR,GT
3,8,19970,242049498,C,T,.,.,PR,GT
4,8,26680,242056208,C,A,.,.,PR,GT


In [6]:
## Process Names vcf names.
## Instance specific processing due to ID copy in VCF file.
## IDs with more than two '_'-separated fields keep their first two fields;
## every other ID keeps only its first field.
Names_vcf= []

for ind in Names:
    id_fields= ind.split('_')
    newid= '_'.join(id_fields[:2]) if len(id_fields) > 2 else id_fields[0]
    Names_vcf.append(newid)


In [7]:
### subset core
coreID_file= 'D:/Rice/Project_external/metadata/Order_core.txt'
with open(coreID_file,'r') as fp:
    # one accession ID per line
    coreIDs= [line.strip() for line in fp]

# positions in Names_vcf of the core accessions found in the VCF, in core-file order
core_idx= [Names_vcf.index(cid) for cid in coreIDs if cid in Names_vcf]

# accession ID -> initial sub-population label
ID_pop= dict(zip(RG_info['ID'], RG_info['Initial_subpop']))

core_names= [Names_vcf[i] for i in core_idx]
core_pop= [ID_pop[name] for name in core_names]


In [8]:

# Restrict the genotype matrix to the core accessions (rows). control_subset is
# incremented so that re-running this cell does not apply core_idx a second time.
if control_subset == 0:
    genotype= genotype[core_idx,:]
    control_subset+= 1
    

In [8]:

# Sub-populations to retain; an empty list skips this whole filtering step.
sub_sel= ['subtrop','trop','temp','aro']
#sub_sel= ['ind1A','temp','aro','aus']

# NOTE(review): this cell overwrites genotype, core_pop, core_names and summary
# in place, so re-running it filters the already-filtered data again.
if sub_sel: 
    # keep only accessions (rows) whose sub-population is in sub_sel
    sub_idx= [x for x in range(len(core_pop)) if core_pop[x] in sub_sel]
    genotype= genotype[sub_idx]
    core_pop= [core_pop[x] for x in sub_idx]
    core_names= [core_names[x] for x in sub_idx]
    
    ##
    # gen_sum: True for markers whose column of genotype codes does not sum to zero
    gen_add= np.sum(genotype,axis= 0)
    gen_sum= gen_add != 0
    
    #
    # NOTE(review): presumably 9 encodes a missing call and 1 a heterozygous call -
    # confirm against read_geno_nanum.
    nan_char= 9
    het_char= 1

    # 0/1 indicator matrix of entries equal to nan_char
    nan_geno= genotype == nan_char
    nan_geno= np.array(nan_geno,dtype= int)
    #
    # het_char is rebound here from the scalar code to the 0/1 indicator matrix of
    # entries equal to that code; a later cell reads het_char as this matrix.
    het_char= genotype == het_char
    het_char= np.array(het_char,dtype= int)
    #
    # arch: per-marker count of entries greater than zero
    arch= genotype > 0
    arch= np.array(arch,dtype= int)
    arch= np.sum(arch,axis= 0)
    
    # per-marker counts of nan_char and het entries (despite the *_row names)
    nan_row= np.sum(nan_geno,axis= 0)
    het_row= np.sum(het_char,axis= 0)
    
    clob_sum= nan_row + het_row
    
    # Despite its name, remove_clob holds the marker indices that are KEPT: markers with
    # a non-zero sum whose count of non-zero entries differs from the nan count, from the
    # het count and from their total.
    remove_clob= [x for x in range(genotype.shape[1]) if gen_sum[x] and arch[x] != nan_row[x] and arch[x] != het_row[x] and arch[x] != clob_sum[x]]
    
    genotype= genotype[:,remove_clob]
    summary= summary.iloc[remove_clob]
    # reset_index() keeps the previous row labels as a new 'index' column
    summary= summary.reset_index()
    
    
    



In [9]:
# Marker summary after filtering; the 'index' column holds the pre-filter row labels.
summary.head()

Unnamed: 0,index,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT
0,0,8,14473,242044001,C,T,.,.,PR,GT
1,1,8,15216,242044744,T,C,.,.,PR,GT
2,6,8,27705,242057233,T,C,.,.,PR,GT
3,9,8,33963,242063491,G,T,.,.,PR,GT
4,10,8,34405,242063933,C,T,.,.,PR,GT


### Global variation

Perform PCA across data set.

Perform Mean shift clustering to attempt to extract genetically coherent groups of accessions.

These will later be used for supervised analysis.

In [10]:
# (individuals, markers) retained after the sub-population and marker filters.
genotype.shape

(385, 4491)

In [11]:
## Perform PCA
n_comp= 4
# NOTE(review): svd_solver='randomized' is used without a random_state, so the
# projection may differ slightly between runs.
pca_global = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')

# feats: one row per individual, n_comp principal-component coordinates
feats= pca_global.fit_transform(genotype)

In [12]:
from structure_tools.Tutorial_subplots import plot_global_pca
## perform MeanShift clustering.
bandwidth = estimate_bandwidth(feats, quantile=0.2)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=True, min_bin_freq=15)
ms.fit(feats)
labels1 = ms.labels_
# cluster label -> row indices of its members in feats
label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1)))}

###
# cluster label -> sub-population label of each member
label_pops= {z: [core_pop[x] for x in g] for z,g in label_select.items()}
# cluster label -> sub-population label of one member drawn at random
label_connect= {z: g[np.random.randint(0,len(g))] for z,g in label_pops.items()}
# cluster label -> colour name.
# NOTE(review): raises IndexError if a label is >= len(PCA_color_ref).
colordict= {z: PCA_color_ref[z] for z in label_pops.keys()}


In [13]:
###
# Draw the global PCA with the structure_tools helper, passing the MeanShift cluster
# membership (label_select), the cluster colours and the sub-population labels.
plot_global_pca(feats,label_select,colordict,labels= core_pop,title= 'global_pca',height= 500,width= 950)


plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



#########################################
## NaNs and Hets

In [14]:
# Dimensions (individuals, markers) of the matrix used in the NaN / het analysis below.
genotype.shape

(385, 4491)

In [21]:

def code_find(nwind,code_v= [1,9],binned= True,axis= 1):
    '''
    Count the entries of nwind that match any code in code_v, summed along `axis`.
    - nwind: numpy array of genotype codes.
    - code_v: codes to look for.
    - binned: if True return a boolean array, True where no code in code_v was found;
      if False return the raw counts.
    - axis: axis along which matches are summed (1: per row, 0: per column).
    '''
    hits= np.zeros(nwind.shape)

    for code in code_v:
        hits= hits + (nwind == code).astype(int)

    totals= hits.sum(axis= axis)

    return totals == 0 if binned else totals


def window_parse(summary, centre= 0, wind_sizes= 20, wind_prox= 1e6):
    '''
    Select candidate window centres by physical proximity to a focal marker.
    - summary: table with a POS column, looked up by row label 0..n-1.
    - centre: row of the focal marker.
    - wind_sizes: window size in number of features; rows close to centre are excluded.
    - wind_prox: maximum distance in POS units from the focal marker.
    returns: list of rows within wind_prox of the focal position and outside the
    excluded zone around centre.
    '''
    focal_pos= int(summary.POS[centre])
    # NOTE(review): the excluded zone is asymmetric - a full wind_sizes below centre but
    # only half of it above. Confirm whether this is intended.
    lower_cut= centre - int(wind_sizes)
    upper_cut= centre + int(wind_sizes)/2

    selected= []
    for row in range(summary.shape[0]):
        if abs(int(summary.POS[row]) - focal_pos) > wind_prox:
            continue
        if row < lower_cut or row > upper_cut:
            selected.append(row)

    return selected

def lwind_extract(genotype, idx= 50, wind_sizes= 50):
    '''
    Extract the block of columns (features) centred on column idx.
    - genotype: 2D array, observations in rows and features in columns.
    - idx: index of the central feature.
    - wind_sizes: window width in number of features.
    returns: genotype[:, idx - wind_sizes/2 : idx + wind_sizes/2]. The window is
    truncated where it would run past either edge of the array.
    '''
    half= int(wind_sizes/2)
    # Clamp the start at 0: numpy reads a negative start as an offset from the end of
    # the array, which silently yields an empty or wrong window near the left edge.
    start= max(idx - half, 0)
    lwind= genotype[:,start:(idx + half)]
    
    return lwind


def bin_keep(lwind, keep= 2):
    """
    Binarise a window: 1 where the entry equals `keep`, 0 everywhere else.
    """
    is_kept= lwind == keep
    return np.array(is_kept,dtype= int)


def sg_varSel(dist_var,proc= 'cluster',stt= 2, min_ind= 10):
    '''
    Filter observations by their variance value. Returns a list of indices into dist_var.
    - dist_var: 1D numpy array of variances, one per observation.
    - proc: selection method:
        - 'cluster': MeanShift-cluster the values; clusters (label -1 excluded) are ranked
          by the standard deviation of their values and the first one holding at least
          min_ind observations is returned. If no cluster is large enough, all indices
          are returned.
        - 'standard': observations whose z-score is below stt.
        - anything else: return the full list of indices.
    - stt: z-score threshold used by 'standard'.
    - min_ind: minimum cluster size used by 'cluster'.
    '''
    
    if proc == 'cluster':
        
        var_col= dist_var.reshape(-1,1)
        bandwidth = estimate_bandwidth(var_col, quantile=0.2)

        ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=False, min_bin_freq=35)

        ms.fit(var_col)
        labels_std = ms.labels_
        std_select = {y:[x for x in range(len(labels_std)) if labels_std[x] == y] for y in sorted(list(set(labels_std)))}
        
        # NOTE(review): clusters are ranked by the std of their values although the
        # variable is called std_gpmeans - confirm whether np.mean was intended.
        std_gpmeans= {z: np.std([dist_var[x] for x in g]) for z,g in std_select.items() if z != -1}

        for label in sorted(std_gpmeans,key= std_gpmeans.get):
            g= std_select[label]
            if len(g) >= min_ind:
                return list(g)
        
        # No cluster reaches min_ind (or every point was left unclustered): do not filter.
        # The previous while-loop ran past the end of the ranking and raised IndexError.
        return list(range(len(dist_var)))

        
    elif proc == 'standard':
        # mean and std are loop-invariant: compute them once
        var_mean= np.mean(dist_var)
        var_sd= np.std(dist_var)
        std_gp_use= [x for x in range(len(dist_var)) if (dist_var[x] - var_mean) / var_sd < stt]
        return std_gp_use
        
    else:
        std_gp_use= list(range(len(dist_var)))

        return std_gp_use

    

def expand_grid(grid_array, expand= 1):
    '''
    Scale a set of points about their centroid (mean over axis 0) by the factor `expand`.
    '''
    points= np.array(grid_array)
    centroid= np.mean(grid_array,axis= 0)
    
    return (points - centroid) * expand + centroid


def index_convert2L(fsamp_keep, idx_l1, idx_l2, 
                    min_miss= 5):
    '''
    Resolve a two-layer index (idx_l2 points into idx_l1, which points into fsamp_keep)
    and optionally drop the entries flagged False in the boolean vector fsamp_keep.
    - entries are dropped only if more than min_miss of them are flagged False;
      otherwise everything is kept.
    returns: (selected values of idx_l1, matching entries of idx_l2).
    '''
    mapped= [idx_l1[pos] for pos in idx_l2]
    flags= fsamp_keep[mapped]
    
    n_flagged_out= len(flags) - sum(flags)
    if n_flagged_out > min_miss:
        kept= [k for k in range(len(mapped)) if flags[k]]
    else:
        kept= list(range(len(mapped)))
    
    return [mapped[k] for k in kept], [idx_l2[k] for k in kept]



def recover_hap(background,like_diet,pca_spec,
                scale= 1, round_t= True):
    '''
    Recover a haplotype from the most likely grid point using the PCA inverse transform.
    - background: array of grid coordinates, one row per grid point.
    - like_diet: likelihood of each grid point.
    - pca_spec: fitted object exposing inverse_transform.
    - round_t: if True, values <= 0 are set to 0, the rest are rounded, multiplied by
      scale and cast to int; if False the raw inverse transform is returned.
    '''
    best_point= background[np.argmax(like_diet)]
    tf_rec= pca_spec.inverse_transform(best_point.reshape(1,-1))

    if not round_t:
        return tf_rec

    tf_rec[tf_rec <= 0] = 0
    return np.array(np.round(tf_rec) * scale,dtype= int)




In [25]:
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import MeanShift, estimate_bandwidth

###
###


###
###
from sklearn.metrics import pairwise_distances


def target_wdDist(genotype, avail_coords= [],
                  nan_obs= [0,0],
               wind_sizes= 100,
                Nrep= 400,
                ncomps= 5,
                 nan_char= [1,9],
                 ind_min= 50,
                 int_check= 50):
    '''
    Given avail_coords list of possible features (window centres),
    randomly sample windows and flag, with the function `code_find`, the observations
    bearing codes to avoid. Find a group of Nrep windows sharing a similar pattern of
    clean observations.
    method:
    - Use binary profiles of windows, indicating the absence of codes to avoid by observation.
    - Use PCA and meanshift clustering to find similar patterns of missingness, select
      windows from the largest cluster with more than ind_min clean observations.
    arguments:
    - genotype: 2D array, observations in rows and features in columns.
    - avail_coords: candidate window centres; all columns of genotype if empty.
    - nan_obs: [observation index, feature index] of the target; only the observation
      index is used here. Windows where the target carries a code to avoid are rejected.
    - wind_sizes: window size in number of features.
    - Nrep: number of windows to return.
    - ncomps: number of PCA components for the clustering of window profiles.
    - nan_char: codes to avoid.
    - ind_min: windows must have more than ind_min clean observations.
    - int_check: number of accepted windows between two clustering attempts.
    returns:
    - (select_same, Seq_store): select_same holds the indices of the observations clean in
      the largest number of retained windows; Seq_store is the list of the Nrep retained
      window centres.
    - ([], Seq_store) if every available window was sampled without reaching a valid
      selection.
    '''
    pca = PCA(n_components=ncomps, whiten=False,svd_solver='randomized')
    
    ###
    if not avail_coords:
        avail_coords= list(range(genotype.shape[1]))
    
    nan_acc= nan_obs[0]
    other_obs= [x for x in range(genotype.shape[0]) if x != nan_acc]
    
    ###
    nan_array= []
    Seq_store= {}
    already_visited= [0] * len(avail_coords)
    
    # d counts accepted windows since the last clustering attempt (offset by int_check)
    d= 0
    trail= []
    
    while len(Seq_store) < Nrep:
        
        stp_idx= np.random.randint(0,len(avail_coords),1)[0]
        stp= avail_coords[stp_idx]

        nwind= lwind_extract(genotype, idx= stp, wind_sizes= wind_sizes)
        
        code_check= code_find(nwind,code_v= nan_char,binned= True,axis= 1)
        code_check= np.array(code_check,dtype= int)
        
        if not code_check[nan_acc]:
            # Rejected windows must also count as visited. Otherwise the exhaustion
            # exit below can never trigger once a single window is rejected, and the
            # loop may never terminate.
            already_visited[stp_idx]= 1
            if sum(already_visited) == len(avail_coords):
                print(d)
                return [], Seq_store
            continue
        
        code_check=[code_check[x] for x in other_obs]
        nan_array.append(code_check)
        trail.append(stp)
        
        if d >= Nrep:
            pres_dat= np.array(nan_array)
            feats= pca.fit_transform(pres_dat)
            print(pres_dat.shape)
            bandwidth = estimate_bandwidth(feats, quantile=0.2)
            
            if bandwidth == 0:
                labels1= [0]*feats.shape[0]
            else:
                ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=True, min_bin_freq=25)
                ms.fit(feats)
                labels1 = ms.labels_
            
            label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1)))}
            label_sizes= {z:len(g) for z,g in label_select.items()}
            lab_max= sorted(label_sizes,key= label_sizes.get,reverse= True)[0]
            ##
            ##
            if label_sizes[lab_max] >= Nrep:
                
                lab_chose= label_select[lab_max]
                pres_dat= pres_dat[lab_chose]
                
                # number of clean observations per window of the largest cluster
                pres_chose= np.sum(pres_dat,axis= 1)
                #
                pres_range= range(min(pres_chose),max(pres_chose)+1)
                # pres_cdf[i]: number of windows with at least pres_range[i] clean observations
                pres_cdf= [sum(pres_chose >= x) for x in pres_range]                
                
                # NOTE(review): thresholds are scanned from the highest down without a
                # break, so the selection kept is the one for the lowest qualifying
                # threshold above ind_min - confirm this is intended.
                for idx in range(len(pres_cdf))[::-1]:
                    if pres_range[idx] > ind_min:
                        if pres_cdf[idx] >= Nrep:
                            pres_select= pres_chose >= pres_range[idx]
                            #
                            # observations clean in the largest number of selected windows
                            select_same= pres_dat[pres_select]
                            select_same= np.sum(select_same,axis= 0)
                            #
                            select_same= select_same == max(select_same)
                            select_same= np.array(other_obs)[select_same]
                            #
                            pres_select= np.array(lab_chose)[pres_select]
                            keep_select= np.random.choice(pres_select,Nrep,replace= False)
                            
                            Seq_store= [trail[x] for x in  keep_select]
            
            # wait for int_check more accepted windows before clustering again
            d-= int_check
        
        
        d+= 1
        
        already_visited[stp_idx]= 1
        if sum(already_visited) == len(avail_coords):
            print(d)
            #print(d,already_visited)
            return [], Seq_store
    
    return select_same, Seq_store
    



In [26]:

def store_differences(genotype, Seq_store, select_same, dr_obj, 
                        dimN= 2, wind_sizes= 50, nan_acc= 0,
                        nan_char= [1,9],
                        metric= 'euclidean',
                        process_tools= {},
                        keep_tools= {}):
    """
    Extract the windows centred on each position of Seq_store and, in each of them,
    compute the distances between the target observation (nan_acc) and the select_same
    observations in the space of the first dimN components of dr_obj.
    - keep_tools: [function, kwargs] returning, per observation, whether it is kept to
      fit dr_obj at a window.
    - process_tools: [function, kwargs] applied to the raw window before projection.
    Windows are skipped when their centre is below wind_sizes/2, when 5 or fewer
    observations are kept, or when the target observation itself is not kept.
    returns: array with one row of distances per window used.
    """
    half_width= wind_sizes/2
    window_dists= []
    
    for centre in Seq_store:
        
        if centre < half_width:
            continue
        
        raw_wind= lwind_extract(genotype,idx= centre, wind_sizes= wind_sizes)
        
        ## which observations to keep is decided on the raw codes, before processing
        clean_obs= keep_tools[0](raw_wind, **keep_tools[1])
        proc_wind= process_tools[0](raw_wind,**process_tools[1])
        
        if sum(clean_obs) <= 5:
            print('ho')
            continue
        
        if not clean_obs[nan_acc]:
            print('hi')
            continue
        
        ## fit on kept observations only, project every observation
        fitted= dr_obj.fit(proc_wind[clean_obs])
        coords= fitted.transform(proc_wind)[:,:dimN]
        
        target= coords[nan_acc].reshape(1,-1)
        window_dists.extend(pairwise_distances(target, coords[select_same],
                                                    metric=metric))
    
    return np.array(window_dists)





def grid_likelihood(dist_grid,dist_store,dist_tools,labelf_select= {},std_gp_use= [],
                   correct_dist= {}, ncomps= 4):
    '''
    Classify the distance vectors in dist_grid against groups of reference distance vectors.
    Reference observations are extracted from the dist_store array and can be subsetted.
    - dist_grid: distance vectors to classify, one row per grid point.
    - dist_store: reference distance vectors, one row per reference window.
    - dist_tools: [function, kwargs]; the function is called as
      function(dist_grid, dist_ref, pca_obj= pca_dists, **kwargs) and returns one
      likelihood per row of dist_grid.
    - labelf_select: group label -> rows of dist_store; a single group holding every row
      if empty. Label -1 and groups of 5 rows or fewer are skipped.
    - std_gp_use: columns (features) of dist_store to use; all of them if empty.
    - correct_dist: group label -> list of distances of its windows to the focal
      position; every group gets [1] if empty.
    - ncomps: number of components of the PCA fitted on dist_store.
    Likelihoods are normalised within each group, weighed by the share of reference
    observations in the group and by the inverse of the smallest distance in
    correct_dist, then summed across groups.
    returns: array with one combined likelihood per row of dist_grid.
    '''
    
    ###
    # length test so that numpy arrays are accepted as well as lists
    if std_gp_use is None or len(std_gp_use) == 0:
        std_gp_use= list(range(dist_store.shape[1]))
    
    if not labelf_select:
        labelf_select= {0: list(range(dist_store.shape[0]))}
    
    if not correct_dist:
        # Values must be iterable because min() is applied to them below; the previous
        # default of a bare 1 raised TypeError.
        correct_dist= {z: [1] for z in labelf_select}
    ###
    print(ncomps)
    pca_dists = PCA(n_components=ncomps, whiten=False,svd_solver='randomized')
    pca_dists= pca_dists.fit(dist_store[:,std_gp_use])
    
    likes_array= []
    supp_prop= []
    dist_prop= []
    
    ##    
    for dist_ref_select,g in labelf_select.items():
        if dist_ref_select == -1:
            continue
        
        if len(g) > 5:
            dist_ref= dist_store[g,:]
            dist_ref= dist_ref[:,std_gp_use]
            
            grid_likes= dist_tools[0](dist_grid,dist_ref,pca_obj= pca_dists,**dist_tools[1])
            
            ###
            likes_array.append(grid_likes)
            supp_prop.append(len(g))
            
            # NOTE(review): a minimum distance of 0 gives an infinite weight below
            dist_prop.append(min(correct_dist[dist_ref_select]))
    
    
    # group weights: share of reference observations, and normalised inverse distance
    supp_prop= np.array(supp_prop) / sum(supp_prop)
    dist_prop= 1 / np.array(dist_prop)
    dist_prop= dist_prop / sum(dist_prop)

    # normalise each group's likelihoods to sum to 1 across the grid
    likes_array= np.array(likes_array)
    likes_array= likes_array / np.nansum(likes_array,axis= 1).reshape(-1,1)

    likes_array= likes_array * supp_prop.reshape(-1,1)
    likes_array= likes_array * dist_prop.reshape(-1,1)
    #
    like_diet= np.nansum(likes_array,axis= 0) 
    #
    return like_diet





In [33]:


#######################################################
#######################################################

def get_likes_engine(genotype, wst, tf, 
                     process_tools, keep_tools, varFilt_tools, dist_tools,
                     wind_sizes= 50, Nreps= 100, ncomps= 3, nan_char= [1,9], ind_min= 50,
                     dimN= 3, metric= 'euclidean', dist_ncomps= None):
    '''
    Build the reference distance profiles used to place a target observation.
    - genotype: 2D array, observations in rows and features in columns.
    - wst: candidate window centres (feature indices).
    - tf: [observation index, feature index] of the target.
    - process_tools, keep_tools: [function, kwargs] pairs forwarded to store_differences.
    - varFilt_tools: [function, kwargs] selecting columns of the distance array from
      their variance across windows.
    - dist_tools: accepted but not used in this function.
    - wind_sizes, Nreps, ncomps, nan_char, ind_min: forwarded to target_wdDist.
    - dimN, metric: forwarded to store_differences.
    - dist_ncomps: number of PCA components used to cluster the distance profiles. If None,
      the notebook-level variable `comps_dists` is used, as before.
    returns: dist_store, labelf_select, correct_dist, select_same, std_gp_use
    '''
    if dist_ncomps is None:
        # backward compatibility: fall back on the notebook-level setting
        dist_ncomps= comps_dists
    
    tf_acc= tf[0]
    tf_pos= tf[1]
    dr_obj = PCA(n_components=ncomps, whiten=False,svd_solver='randomized')
    
    ### get array of similar windows
    select_same, Seq_store= target_wdDist(genotype, avail_coords= wst,
                      nan_obs= tf,
                       wind_sizes= wind_sizes,
                        Nrep= Nreps,
                        ncomps= ncomps,
                         nan_char= nan_char,
                         ind_min= ind_min)


    ### get distances array from extracted windows
    dist_store= store_differences(genotype, Seq_store, select_same, dr_obj, 
                            dimN= dimN, wind_sizes= wind_sizes, nan_acc= tf_acc,
                            nan_char= nan_char,
                            metric= metric,
                            process_tools= process_tools,
                            keep_tools= keep_tools)


    ### Variance in distances across reference windows
    dist_var= np.std(dist_store,axis= 0)**2

    # subselect features by variance in distances:
    std_gp_use= varFilt_tools[0](dist_var,**varFilt_tools[1])

    ### Clustering windows
    ###
    pca_cl = PCA(n_components=dist_ncomps, whiten=False,svd_solver='randomized')
    featd= pca_cl.fit_transform(dist_store)
    bandwidth = estimate_bandwidth(featd, quantile=0.2)

    if bandwidth < .01:
        # profiles are too similar to cluster: treat them as a single group
        labelf_select= {0: list(range(featd.shape[0]))}
    else:
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=False, min_bin_freq=15)
        ms.fit(featd)
        labelsf = ms.labels_
        labelf_select = {y:[x for x in range(len(labelsf)) if labelsf[x] == y] for y in sorted(list(set(labelsf)))}


    ### 
    ### Distances to focal position by cluster, in number of features.
    # NOTE(review): rows of dist_store are matched to Seq_store by position, but
    # store_differences can skip windows; if any is skipped the two are no longer aligned.
    correct_dist= {
        z: [abs(Seq_store[x] - tf_pos) for x in g] for z,g in labelf_select.items()
    }

    ### 
    
    return dist_store, labelf_select, correct_dist, select_same, std_gp_use




def window_exam(featl, samp_keep, select_same, std_gp_use, dist_store, dist_tools, 
                labelf_select= {},correct_dist= {},
               P= 25, dimN= 3, metric= "euclidean", grid_expand= None):
    '''
    Compute likelihoods over a background grid in the feature space of a local window.
    - featl: coordinates of every observation at the local window.
    - samp_keep: boolean vector, True for observations usable at this window.
    - select_same: reference observations returned by target_wdDist.
    - std_gp_use: positions within select_same (columns of dist_store) to use.
    - dist_store, dist_tools, labelf_select, correct_dist: forwarded to grid_likelihood.
    - P, dimN: forwarded to get_bg_grid to build the grid.
    - metric: distance metric between grid points and reference observations.
    - grid_expand: factor by which the grid is scaled about its centre. If None, the
      notebook-level variable `expand` is used, as before.
    returns: (background, like_diet) - grid coordinates and their likelihoods.
    '''
    if grid_expand is None:
        # backward compatibility: fall back on the notebook-level setting
        grid_expand= expand
    
    #### subselect samples to use at window given missingness
    sample_select, std_gp_use= index_convert2L(samp_keep, select_same, std_gp_use)
    
    #### define grid to use
    Quanted_set= np.array(featl)
    background= get_bg_grid(Quanted_set , P= P, dimN= dimN)
    background= expand_grid(background, expand= grid_expand)


    ###
    ### DISTS for this grid
    workfeat= featl[sample_select,:dimN]
    dist_grid= pairwise_distances(background, workfeat,
                                                metric=metric)

    ###
    like_diet= grid_likelihood(dist_grid,dist_store,dist_tools,
                               labelf_select= labelf_select,
                               std_gp_use= std_gp_use,
                              correct_dist= correct_dist)
    
    return background, like_diet





In [36]:
# Hover labels for the plots: index, global MeanShift cluster and sub-population.
idvector= ['i{}_pop{}_{}'.format(x,labels1[x],core_pop[x]) for x in range(len(labels1))]
#box_vector= [idvector[x] for x in select_same]


Nwindows= 200 # Number of windows
wind_sizes= 20 # sizes in number of features
ncomps= 3

# maximum distance, in POS units, between a reference window and the focal marker
wind_prox= 1e6

# codes to avoid, and the code retained when binarising windows (see bin_keep)
nan_char= [1,9]
code_keep= 2

### seq_store and dist tools
metric= 'euclidean'
dimN= ncomps
Nreps= 80
ind_min= 50

#### Observation clustering tools
# read as a global by get_likes_engine
comps_dists= 5

#### Background Grid tools
P= 25
# read as a global by window_exam
expand=1

#### KDE LIKELIHOOD TOOLS
dist_comps= 10
Bandwidth_split = 20
####
####



In [None]:
#####
##### Functions and tools packages.
##### Each tool is a [function, kwargs] pair, called downstream as function(data, **kwargs).
### window position select function and arguments
# NOTE(review): tf_pos is defined in a later cell (focal position selection), so this
# cell fails on a fresh kernel and 'centre' is frozen at the value tf_pos had when this
# cell was last run.
wparse_func= window_parse
wparse_args= {
    'centre': tf_pos,
    'wind_sizes': wind_sizes,
    'wind_prox': wind_prox
}

### 
### local window processing function and arguments
lproc_func= bin_keep
lproc_args= {
    'keep': code_keep
}

###
### local window - which individuals to keep, return boolean
lkeep_func= code_find
lkeep_args= {
    'code_v': nan_char,
    'binned': True,
    'axis': 1
}

###########
###########
from impute_tools.impute_tools import get_bg_grid
from sklearn.metrics import pairwise_distances


process_tools= [
    lproc_func,
    lproc_args
]

keep_tools= [
    lkeep_func,
    lkeep_args
]

#### Feature Var processing tools
# 'none' makes sg_varSel return every feature (no variance filtering)
varFilt_func= sg_varSel
varFilt_args= {
    'proc': 'none'
}

# NOTE(review): uses sg_varSel directly rather than varFilt_func defined just above
varFilt_tools= [sg_varSel, varFilt_args]


####
#### KDE LIKELIHOOD TOOLS
from impute_tools.impute_tools import kde_likes_extract

####
####
dist_func= kde_likes_extract
dist_args= {
    'dist_comps': dist_comps
}
dist_tools= [dist_func,dist_args]

###
### PCA object for local windows
dr_obj = PCA(n_components=ncomps, whiten=False,svd_solver='randomized')


In [94]:
#### select position to exclude
####
# Candidate (row, column) positions where the het indicator matrix is 1.
# NOTE(review): het_char was computed before the marker filter, so its column indices
# refer to the unfiltered genotype matrix. The tf drawn here is overwritten below.
tfoc= np.where(het_char==1)
tfoc= np.array(tfoc).T

select_idx= np.random.randint(0,tfoc.shape[0],1)[0]
tf= tfoc[select_idx]


nan_n= 1

# Target actually used: a random (individual, marker) cell of the genotype matrix.
# NOTE(review): no random seed is set, so the target changes on every run.
xnan= np.random.randint(0,genotype.shape[1],size= nan_n)[0]
ynan= np.random.randint(0,genotype.shape[0],size= nan_n)[0]

# tf = [individual index, marker index]
tf= [ynan,xnan]
tf_acc= tf[0]
tf_pos= tf[1]
#tf_acc= 70
tf

[178, 3772]

In [95]:
###
### Candidate reference windows around the focal marker.
### 'centre' is overridden with the current tf_pos: wparse_args is built in an earlier
### cell and would otherwise carry the focal position of a previous target.
wst= wparse_func(summary,**{**wparse_args, 'centre': tf_pos})
print('# pos: {}'.format(len(wst)))

###
### Reference distance profiles for the target tf.

dist_store, labelf_select, correct_dist, select_same, std_gp_use= get_likes_engine(genotype, wst, tf, 
                     process_tools, keep_tools, varFilt_tools, dist_tools,
                     wind_sizes= wind_sizes, Nreps= Nreps, ncomps= ncomps, nan_char= nan_char, ind_min= ind_min,
                     dimN= dimN, metric= metric)

print(dist_store.shape)

# pos: 289
(81, 384)
(131, 384)
(80, 214)


In [96]:
##
## local window - get and process
local_l= lwind_extract(genotype, idx= tf_pos, wind_sizes= wind_sizes)

## samp keep - samples without het or nan at this window
samp_keep= lkeep_func(local_l, **lkeep_args)
# number of samples usable at this window
print(sum(samp_keep))

### process local window - convert to haplotypes - keep only code_keep
local_l= lproc_func(local_l,**lproc_args)

##
## local PCA and PCA transform
##
# fit on the usable samples only, then project every sample
pca_special= dr_obj.fit(local_l[samp_keep])
featl= pca_special.transform(local_l)

##

385


In [97]:
# Likelihood of each background grid point (like_diet) in the local PCA space, computed
# from the reference distance profiles.
background, like_diet= window_exam(featl, samp_keep, select_same, std_gp_use, dist_store, dist_tools, 
                labelf_select= labelf_select,correct_dist= correct_dist,
               P= P, dimN= dimN, metric= metric,)

4


In [98]:

# One scatter trace per global MeanShift cluster, drawn in the local PCA space
# (first two components).
figwl= [go.Scatter(
    x= featl[label_select[i],0],
    y= featl[label_select[i],1],
    text= [idvector[x] for x in label_select[i]],
    mode= 'markers',
    name= str(i),
    marker= dict(
        size= 10
    )
) for i in label_select.keys()]


# Hollow blue ring marking the observed position of the target individual.
figwl.append(go.Scatter(
    mode='markers',
    #x= [pca_w.transform(local_l[tf_acc].reshape(1,-1))[0][0]],
    #y= [pca_w.transform(local_l[tf_acc].reshape(1,-1))[0][1]],
    x=[featl[tf_acc,0]],
    y=[featl[tf_acc,1]],
    marker=dict(
        color='rgba(135, 206, 250, 0)',
        size=25,
        opacity= 1,
        line=dict(
            color='blue',
            width=5
        )
    ),
    showlegend=False
))

layout= go.Layout()

Figure_wl= go.Figure(data= figwl, layout= layout)

iplot(Figure_wl)

In [99]:

# Reconstruct the haplotype at the most likely grid point through the inverse PCA transform.
tf_rec= recover_hap(background,like_diet,pca_special,
                scale= 1, round_t= True)

# Project the reconstructed haplotype back into the local PCA space for plotting.
tf_proj= pca_special.transform(tf_rec.reshape(1,-1))[0]
#tf_proj= pca_w.transform(local_l[tf_acc].reshape(1,-1))[0]

In [100]:
from plotly import tools
from plotly.subplots import make_subplots

ploidy= 1
title= 'coords'
# plotly.tools.make_subplots is deprecated in favour of plotly.subplots.make_subplots
fig_subplots = make_subplots(rows=1, cols=2,subplot_titles=tuple([title]*2))

# left panel: individuals in the local PCA space (add_trace replaces the deprecated append_trace)
for cluster_trace in figwl:
    fig_subplots.add_trace(cluster_trace, row= 1, col= 1)

# largest likelihood, used to scale marker opacity; guard against division by zero
correct= max(like_diet)

if correct==0:
    correct= 1

# per-point opacity proportional to likelihood only when the grid has more than 2 dimensions
if background.shape[1] > 2:
    opac = like_diet / correct 
else:
    opac= .8

# right panel: background grid coloured by likelihood
trace= go.Scatter(
    x= background[:,0],
    y= background[:,1],
    #z= grid_likes,
    mode= 'markers',
    marker= {
        'color': like_diet,
        'colorbar': go.scatter.marker.ColorBar(
            title= 'ColorBar'
        ),
        'colorscale':'Viridis',
        'line': {'width': 0},
        'size': 25,
        'symbol': 'circle',
      "opacity": opac #like_diet / correct
      }
)

like_max= np.argmax(like_diet)

# hollow red ring: projection of the reconstructed haplotype
target_found= go.Scatter(
    mode='markers',
    #x= [pca_w.transform(local_l[tf_acc].reshape(1,-1))[0][0]],
    #y= [pca_w.transform(local_l[tf_acc].reshape(1,-1))[0][1]],
    #x= [background[like_max][0]],
    #y= [background[like_max][1]],
    x= [tf_proj[0]],
    y= [tf_proj[1]],
    name= 'max_predict',
    marker=dict(
        color='rgba(135, 206, 250, 0)',
        size= 35,
        opacity= 1,
        line=dict(
            color='red',
            width=5
        )
    ),
    showlegend=False
)

fig_subplots.add_trace(target_found, row= 1, col= 1)
fig_subplots.add_trace(trace, row= 1, col= 2)

iplot(fig_subplots)


plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



In [101]:
# Overlay: likelihood grid first (drawn underneath), then the individuals and target ring.
fig_dual= [trace]

fig_dual.extend(figwl)

layout= go.Layout(
    height= 700,
    width= 900
)
Figure= go.Figure(data= fig_dual,layout= layout)
iplot(Figure)

## Result

Our predicted haplotype

In [102]:
# Haplotype reconstructed from the most likely grid point (one value per window feature).
tf_rec

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

The original haplotype

In [103]:
# Observed, binarised window of the target individual, for comparison with tf_rec.
local_l[tf_acc]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## Continuation

The MS-select grid refinement (`nBg_MS` with `gridWalk`) is still to be incorporated into the method above.

Currently this method does not perform the distance profile parse and composite likelihood extraction that the above method does.

The limitation can be seen below.

In [113]:
from impute_tools.impute_tools import (
    nBg_MS, nBg_grid,
    gridWalk
)

# Reference distance profiles passed to gridWalk.
# NOTE(review): the second assignment overwrites the first, so the row selection of
# cluster dist_ref_select is discarded and every reference window is used. If the
# cluster subset was intended, the second line should index dist_ref, not dist_store.
dist_ref_select= 0
dist_ref= dist_store[labelf_select[dist_ref_select],:]
dist_ref= dist_store[:,std_gp_use]

# NOTE(review): P, dimN, dist_comps, Bandwidth_split and metric are re-assigned here and
# replace the values set in the parameter cell for any cell run afterwards.
P= 40
dimN= 2
N_samps= P**dimN
dist_comps= 10
Bandwidth_split = 30
kernel= 'gaussian'
metric= 'euclidean'


# background-generation function and its arguments, handed to gridWalk
BG_func= nBg_MS
BG_args= {
    'lb':0.05,
    'up':0.6,
    'kernel': kernel,
    'N_samps': 5
}


# granted: grid coordinates; grid_likes: value per grid point (both plotted below)
granted, grid_likes= gridWalk(featl,dist_ref,BG_func, BG_args= BG_args, std_gp_use= std_gp_use,
            P= P,
            dimN= dimN,
            N_samps= N_samps,
            dist_comps= dist_comps,
            Bandwidth_split = Bandwidth_split,
            metric= metric,
            kernel= kernel,
            min_samp= 10)



In [114]:
# (reference windows, reference observations) in the profile array given to gridWalk.
dist_ref.shape

(80, 214)

In [116]:
from plotly import tools
from plotly.subplots import make_subplots

title= 'coords'
# plotly.tools.make_subplots is deprecated in favour of plotly.subplots.make_subplots
fig_subplots = make_subplots(rows=1, cols=2,subplot_titles=tuple([title]*2))

# left panel: individuals in the local PCA space (add_trace replaces the deprecated append_trace)
for cluster_trace in figwl:
    fig_subplots.add_trace(cluster_trace, row= 1, col= 1)
    

# right panel: gridWalk grid points coloured by their likelihood
trace= go.Scatter(
    x= granted[:,0],
    y= granted[:,1],
    #z= grid_likes,
    mode= 'markers',
    marker= {
        'color':grid_likes,
        'colorbar': go.scatter.marker.ColorBar(
            title= 'ColorBar'
        ),
        'colorscale':'Viridis',
        'line': {'width': 0},
        'size': 8,
        'symbol': 'circle',
      "opacity": 1
      }
)

fig_subplots.add_trace(trace, row= 1, col= 2)

iplot(fig_subplots)