In [1]:
import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.metrics.pairwise import euclidean_distances

import pandas as pd
import numpy as np

from scipy import stats
from scipy.stats import beta
from math import sin
from random import randint

import itertools as it

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

import collections

def recursively_default_dict():
        return collections.defaultdict(recursively_default_dict)

from matplotlib.collections import BrokenBarHCollection

from random import sample
from random import shuffle


import Modules_tools

### MeanShift for imputation of Missing data.

**Steps**:

- PCA
- Mean Shift clustering
- KNeighbors supervised classification (*)

*** NN line trailing. results are less good.

**Data generation**:

- Populations: Frequency vectors. Beta distribution (scipy implementation).
- Observations: Binomial draws from population vectors.

**Context**

Imputation of missing data is an inference problem. Assuming that a probability density function of a
variable can be derived from the data, as estimate can be made of the value of a new observation.

Across a large data set, it is rare that all samples originate from a single population. Factor variables are
usually provided with the data so that the appropriate corrections can be made.

In big data, and in genomics in particular, it is increasingly the case that such corrections have to 
be made across many data sets for which categorical variables are not available, or untrustworthy.

This context requires automated approaches to perform operations (summary statistics, imputation), that rely
on unsupervised classifications to produce the necessary factors.

In this repository, we explore the use of Mean Shift clustering to impute binary data in structure populations.

In [2]:
# Simulate frequency vectors. 
# We must first define the number of populations, the length of the haplotypes desired, and their respective population sizes
L= 150

import itertools as it
n= 200

# Vary a (beta distribution parameter).
a_range= np.linspace(1,2,20)
a_set= [i for i in a_range for _ in range(n)]

# vary b.
b_range= np.linspace(0.1,.4,20)
b_set= [i for i in b_range for _ in range(n)]

## length of haplotypes to extract.
L_set= [L] * n * 20


background_1= np.array([a_set,b_set,L_set]).T

vector_lib= []
for k in range(background_1.shape[0]):
    
    probs= beta.rvs(background_1[k,0], background_1[k,1], size=int(background_1[k,2]))
    probs[(probs > 1)]= 1
    
    
    vector_lib.append(probs)

vector_lib= np.array(vector_lib)

In [3]:
print('Number of frequency vectors of size {} generated: {}'.format(vector_lib.shape[1],vector_lib.shape[0]))


Number of frequency vectors of size 150 generated: 4000


In [4]:
## PCA on vectors simulated
n_comp = 100

pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(vector_lib)
features = pca.transform(vector_lib)# * pca.explained_variance_ratio_
var_comps= pca.explained_variance_ratio_

print("; ".join(['PC{0}: {1}'.format(x+1,round(pca.explained_variance_ratio_[x],3)) for x in range(n_comp)]))
print('features shape: {}'.format(features.shape))

PC1: 0.019; PC2: 0.009; PC3: 0.009; PC4: 0.009; PC5: 0.009; PC6: 0.009; PC7: 0.009; PC8: 0.009; PC9: 0.009; PC10: 0.009; PC11: 0.009; PC12: 0.009; PC13: 0.009; PC14: 0.009; PC15: 0.008; PC16: 0.008; PC17: 0.008; PC18: 0.008; PC19: 0.008; PC20: 0.008; PC21: 0.008; PC22: 0.008; PC23: 0.008; PC24: 0.008; PC25: 0.008; PC26: 0.008; PC27: 0.008; PC28: 0.008; PC29: 0.008; PC30: 0.008; PC31: 0.008; PC32: 0.008; PC33: 0.008; PC34: 0.008; PC35: 0.008; PC36: 0.008; PC37: 0.008; PC38: 0.007; PC39: 0.007; PC40: 0.007; PC41: 0.007; PC42: 0.007; PC43: 0.007; PC44: 0.007; PC45: 0.007; PC46: 0.007; PC47: 0.007; PC48: 0.007; PC49: 0.007; PC50: 0.007; PC51: 0.007; PC52: 0.007; PC53: 0.007; PC54: 0.007; PC55: 0.007; PC56: 0.007; PC57: 0.007; PC58: 0.007; PC59: 0.007; PC60: 0.007; PC61: 0.007; PC62: 0.007; PC63: 0.007; PC64: 0.007; PC65: 0.007; PC66: 0.007; PC67: 0.007; PC68: 0.007; PC69: 0.007; PC70: 0.006; PC71: 0.006; PC72: 0.006; PC73: 0.006; PC74: 0.006; PC75: 0.006; PC76: 0.006; PC77: 0.006; PC78: 0.

### MRCA - Most Recent Common Ancestor.

The following block serves to tie all the populations in the vector data set together.

The random generation of frequency vectors creates vectors distinct along, assymptotically, all possible directions.

Here, we limit the number of possible directions, by creating a data set made entirely of vectors generated as described for the manipulation of genetic distances, i.e. from equally distant coordinates between two initial projections. We continue to rely on pairs of initial projections. However, here, only one projection is made to vary, while the other is chosen beforehand and remains the same. 

The result is the starshaped distribution observed in the next graph.


In [5]:
Iter= 50
target= [0,1]
stairs= 4

MRCA= np.random.choice(range(vector_lib.shape[0]),1)
calypso= []
feat= []

for inter in range(stairs):
    Pair= np.random.choice(range(vector_lib.shape[0]),2,replace= False)
    Pair[1]= MRCA
    print(Pair)
    
    coords= features[Pair,:]
    
    vector2= coords[target[1]] - coords[target[0]]
    for angle in np.linspace(-20,20,Iter):
        new_guy = coords[target[0]] + [angle / 10 * x for x in vector2]
        
        feat.append(new_guy)
        
        new_guy= pca.inverse_transform(new_guy)
        new_guy[new_guy < 0]= 0
        new_guy[new_guy > 1]= 1
        
        calypso.append(new_guy)

#features= np.array(feat)
#vector_lib= np.array(calypso)
print(vector_lib.shape)

[962 182]
[385 182]
[1606  182]
[2883  182]
(4000, 150)


In [6]:
#### Let's plot the first 3 coordinates nonetheless.
####
fig_data= [go.Scatter3d(
        x = features[:,0],
        y = features[:,1],
        z = features[:,2],
        type='scatter3d',
        mode= "markers",
        text= ['a: {}; b: {}, L: {}; index = {}'.format(background_1[k,0],background_1[k,1],background_1[k,2], k) for k in range(background_1.shape[0])],
        marker= {
        'line': {'width': 0},
        'size': 4,
        'symbol': 'circle',
      "opacity": .8
      }
    )]


layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    ),
    scene= Scene(
    yaxis=dict(
        title='PC2: {}'.format(round(var_comps[1],3))),
    xaxis=dict(
    title= 'PC1: {}'.format(round(var_comps[0],3))),
    zaxis=dict(
    title= 'PC3: {}'.format(round(var_comps[2],3))))
)

fig = go.Figure(data=fig_data, layout=layout)
iplot(fig)


**PCA of frequency vectors generated**

The next block defines functions to introduce missing data, and optimize clean data sets for imputation.

In [7]:

def introduce_error(data,data_structure,error_structure):
    
    new_data= []
    Na_tree= {}
    
    for mark in range(data.shape[1]):
        
        new_row= data[:,mark]
        where_na=[]
        
        for gp in data_structure.keys():
            whose= data_structure[gp]
            
            
            na_idx= [np.random.choice([0,1],p= error_structure[gp]) for x in range(len(whose))]
            na_idx= [data_structure[gp][x] for x in range(len(na_idx)) if na_idx[x] == 1]
            
            where_na.extend(na_idx)
        
        dict_na= {x:data[x,mark] for x in where_na}
        
        new_row= np.array(new_row)
        new_row[where_na] = np.nan
        
        new_data.append(new_row)
        
        Na_tree[mark]= dict_na
    
    new_data= np.array(new_data).T
    
    return new_data, Na_tree


def introduce_error_FV(data,data_structure,error_structure):
    
    new_data= []
    Na_tree= {}
    
    for mark in range(data.shape[1]):
        
        new_row= data[:,mark]
        where_na=[]
        
        for gp in data_structure.keys():
            whose= data_structure[gp]
            
            
            na_idx= [np.random.choice([0,1],p= error_structure[gp][mark]) for x in range(len(whose))]
            na_idx= [data_structure[gp][x] for x in range(len(na_idx)) if na_idx[x] == 1]
            
            where_na.extend(na_idx)
        
        dict_na= {x:data[x,mark] for x in where_na}
        
        new_row= np.array(new_row)
        new_row[where_na] = np.nan
        
        new_data.append(new_row)
        
        Na_tree[mark]= dict_na
    
    new_data= np.array(new_data).T
    
    return new_data, Na_tree



def optimize_size(SNP_dict,ladder,burn,M,N):
    '''
    a silly way of obtaining a good representation
    '''
    #Penal= 1 / float(max([len(x) for x in SNP_dict.values()]))
    max_na= float(max([len(x) for x in SNP_dict.values()]))
    
    if max_na == 1:
        Penal= 0
    if max_na == 0:
        return {1:[x for x in SNP_dict]}
    if max_na > 1:
        Penal= 1 / max_na
    
    Snp_org= sorted(SNP_dict, key=lambda k: len(SNP_dict[k]), reverse=True)
    
    ## this is the part that is utterly simplified for the moment.
    ## Sampling of SNps will be interated and probabilities will be penalized by the proportion 
    ## of missing data.
    Ps= [1 - len(SNP_dict[x]) * Penal for x in Snp_org]
    Ps= np.array([x / float(sum(Ps)) for x in Ps])
    Ps[Ps < 0] = 0
    
    ### There must be a better way.
    ###
    if sum(Ps) > 2:
        print('Sum: {}'.format(sum(Ps)))
        diff= 1 - sum(Ps)
        where_safe= [x for x in range(len(Ps)) if Ps[x] + diff >= 0 and Ps[x] + diff <= 1]
        if not where_safe:
            print(diff)
        where_to= np.random.choice(where_safe,1)[0]
        Ps[where_to]+= diff
        print('Sum processed: {}'.format(sum(Ps)))
    
    
    Void= [x for x in range(len(Ps)) if Ps[x] == 0]
    Void= len(Void)
    
    #print(Ps)
    
    Records= defaultdict( list )
    
    for cp in range(Void,Void +ladder):
        if cp >= len(Snp_org):
            continue
        for ring in range(burn):
            
            Select= np.random.choice(range(len(Snp_org)), len(Snp_org) - cp, replace=False, p=Ps)
            Select= [Snp_org[x] for x in Select]

            affected= list(set([z for z in it.chain(*[SNP_dict[x] for x in Select])]))

            SizeA= ((M - len(affected)) / float(M))
            SizeB= len(Select) / float(N)
            
            ## This is the variable we estimate quality. 
            ## In this case the relative size of the data set.
            ## could be improved.
            Size= SizeA * SizeB
            
            ## round
            Size= round(Size,3)

            Records[Size].append(Select)
    
    
    return Records


###
###
###

def extract_profiles_class(global_data,target_ind_dict):
    '''
    copy of the previous function. change of name to deal with local 
    function similarity.
    '''
    ## estimate the bandwith
    cluster_profiles= recursively_default_dict()
    params = {'bandwidth': np.linspace(np.min(global_data), np.max(global_data),20)}
    grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0)
    
    combine= {}
    for bull in target_ind_dict.keys():
        
        Quanted_set= global_data[target_ind_dict[bull],:]
        grid.fit(Quanted_set)
        kde = grid.best_estimator_
        
        P_dist = kde.score_samples(Quanted_set)
        Fist = kde.score_samples(global_data)

        ## Normalizing log-likelihood estimates by those of the reference set and extracting their cdf.
        Fist = scipy.stats.norm(np.mean(P_dist),np.std(P_dist)).cdf(Fist)
        cluster_profiles[bull]=Fist

    
    return cluster_profiles
    
###
###
###


###  I.Static simulations.

Imputation of a single structured data set.

i. Select populations from `vector_lib` allele frequency data set in `Pops` list. 

ii. Population sample sizes in `Sizes` list.

iii. Number of variables: `L`.



In [8]:
#### produce populations and calculate Fsts.
import Modules_tools

Pops= [75,1,122]
Sizes= [150,130,90]


N_pops= len(Pops)

L= 150

labels= np.repeat(np.array([x for x in range(N_pops)]),Sizes)

Vectors= vector_lib[Pops,:]

data= []

for k in range(N_pops):
    
    probs= Vectors[k,:]
    
    probs[(probs > 1)]= 1
    
    m= Sizes[k]
    Haps= [[np.random.choice([float(1),float(0)],p= [1-probs[x],probs[x]]) for x in range(L)] for acc in range(m)]
    
    data.extend(Haps)

data= np.array(data)
print(data.shape)

(370, 150)


In [23]:
print('Global proportion of 0 {}'.format(1 - np.sum(data) / (data.shape[0] * data.shape[1])))


Global proportion of 0 0.914018018018018


### II. Introducing missing data.

Introduce missing data using a binomial probability across markers or assuming a Beta distribution of missinge data frequencies.

In [21]:

#### introduce missing data
Pops_imp= Sizes
N_imp= L
Imp_labs= {x:[sum(Pops_imp[:x]) + z for z in range(Pops_imp[x])] for x in range(len(Pops_imp))}

### Normally distributed
Imps_probs= [0.99,0.98,0.99]
Imps_Pdict= {
    x: [Imps_probs[x], 1 - Imps_probs[x]] for x in Imp_labs.keys()
}

## Beta distributed

Imps_Pdict_beta= {
    z: [[x,1 - x] for x in beta.rvs(background_1[k,0], background_1[k,1], size=int(background_1[k,2]))] for z in Imp_labs.keys()
}

data_zeros= np.zeros(shape=(sum(Pops_imp),N_imp))


#data_treat, NA_tree= introduce_error_FV(data,Imp_labs,Imps_Pdict_beta) 
data_treat, NA_tree= introduce_error(data,Imp_labs,Imps_Pdict) 
print(1 - np.nansum(data_treat) / (data_treat.shape[0] * data_treat.shape[1]))


0.915171171171


### III. Correcting missing data at for a single variable.

For development and as an initial test, we will impute data at a single site (variable).

All missing observations at the selected site will be recorded.

The remaining data set will be surveyed for the best possible combination of clean observations and variables with which to perform inference.

The process of selection is run by the function `optimize_size`. This process is heuristic and depends only on the arguments `burn`and `ladder`.

#### Optimize_size

This function explores random combinations of tests. The probability of drawing each variable is balanced by the number of missing data it holds. For each set drawn the proportion of clean observations is multiplied by the proportion of variables selected. The score of each trial is stored along with the markers used. The resulting {score :: variables} dictionary is returned.

- **burn**: number of trials to perform for a a given sample size.
- **ladder**: range of reductions in number of variables selected (sample size).


In [11]:
from collections import defaultdict


### Selected variable (Index).
mark= 12

### Impute missing data
###

NA_as= np.argwhere(np.isnan(data_treat))
print('total initial NA: {}'.format(len(NA_as)))
print('proportion: {}'.format(len(NA_as) / (data.shape[0] * data.shape[1])))

NA_lib= defaultdict( list )

for n,v in NA_as:
    NA_lib[v].append(n)

for n in range(data_treat.shape[1]):
    if n not in NA_lib.keys():
        NA_lib[n]= []


N_lib_sort= sorted(NA_lib, key=lambda k: len(NA_lib[k]), reverse=True)

print(len(NA_lib))

touched= NA_lib[mark]
print(NA_tree[mark])

SNP_clean_target= {x:NA_lib[x] for x in range(data_treat.shape[1]) if x if len([y for y in touched if y in NA_lib[x]]) == 0}

ladder= 50
burn= 200

records= optimize_size(SNP_clean_target,ladder,burn,data_treat.shape[0],data_treat.shape[1])

total initial NA: 763
proportion: 0.013747747747747749
150
{14: 0.0, 40: 0.0, 153: 1.0, 202: 1.0, 223: 0.0, 275: 1.0}


In [25]:
len('number of variables with no Nan values at the positions in the selected variable: {}'.format(len(SNP_clean_target)))

85

In [26]:
print(sorted(records.keys()))
Chose_set= max(records.keys())
Chose_set= [x for x in records[Chose_set][0]]

affected= list(set([x for x in it.chain(*[SNP_clean_target[z] for z in Chose_set])]))
not_affected= [x for x in range(data_treat.shape[0]) if x not in affected]

Clean_set= data_treat[:,Chose_set]
Clean_set= Clean_set[not_affected,:]

Clean_set.shape



[0.151, 0.152, 0.153, 0.154, 0.155, 0.156, 0.157, 0.158, 0.159, 0.16, 0.161, 0.162, 0.163, 0.164, 0.165, 0.166, 0.167, 0.168, 0.169, 0.17, 0.171, 0.172, 0.173, 0.174, 0.175, 0.176, 0.177, 0.178, 0.179, 0.18, 0.181, 0.182, 0.183, 0.184, 0.185, 0.186, 0.187, 0.188, 0.189, 0.19, 0.191, 0.192, 0.193, 0.194, 0.195, 0.196, 0.197, 0.198, 0.199, 0.2, 0.201, 0.202, 0.203, 0.204, 0.205, 0.206, 0.207, 0.208, 0.209, 0.21, 0.211, 0.212, 0.213, 0.214, 0.215, 0.216, 0.217, 0.218, 0.219, 0.22, 0.221, 0.222, 0.223, 0.224, 0.225, 0.226, 0.227, 0.228, 0.229, 0.23, 0.231, 0.232, 0.233, 0.234, 0.235, 0.236]


(131, 100)

**above:** range of scores obtained during testing procedure.

In [16]:
n_comp = 3

pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')
features = pca.fit_transform(Clean_set)

#### MeanShift clusters

bandwidth = estimate_bandwidth(features, quantile=0.2)
params = {'bandwidth': np.linspace(np.min(features), np.max(features),20)}
grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0)

## perform MeanShift clustering.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all= True, min_bin_freq=15)
ms.fit(features)
labels1 = ms.labels_
label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1))) if y != -1}

###
iu_control= np.triu_indices(len(label_select),1)
MS_centroids= [np.mean(features[label_select[z],:],axis= 0) for z in label_select.keys()]
###

from sklearn import neighbors

weights= 'distance'

print('cluster size: {}'.format([len(x) for x in label_select.values()]))

###
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(10,3), random_state=1)

###

for cp in label_select.keys():

    train_keys= [not_affected[x] for x in label_select[cp] if not_affected[x] not in touched]
    test_keys= [not_affected[x] for x in label_select[cp] if not_affected[x] in touched]

    neibh_opt= 10
    if len(train_keys) < neibh_opt:
        n_neighbors= len(train_keys)
    if len(train_keys) > neibh_opt:
        n_neighbors= neibh_opt

    if len(label_select[cp]) >= 5:
        
        y_train= data_treat[train_keys,mark]
        
        
        X_train= np.array([Clean_set[x,:] for x in label_select[cp] if not_affected[x] not in touched])
        X_test= np.array([Clean_set[x,:] for x in label_select[cp] if not_affected[x] in touched])
        if len(X_test) == 0:
            continue
        
        clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
        clf.fit(X_train, y_train)
        
        Z = clf.predict(X_test)
        
        print('concerned: {}'.format(test_keys))
        
        print('predicted: {}'.format(Z))

print('truth: {}'.format(NA_tree[mark]))


cluster size: [77, 39, 34]
concerned: [14, 40]
predicted: [ 0.  0.]
concerned: [153, 202, 223, 275]
predicted: [ 1.  1.  1.  1.]
truth: {14: 0.0, 40: 0.0, 153: 1.0, 202: 1.0, 223: 0.0, 275: 1.0}


**above** missing data at the variable selected were inferred. All other variables were first surveyed for a clean data set suitable for inference. The selected data set was reduced using PCA, and the first 3 components selected. All samples were clustered using the Mean Shift algorithm. Missing data at the variable selected was inferred using cluster specific samples. Missing data born by samples falling in clusters of size < 3 were not imputed.


### IV.  Imputation of an entire data sets.

The same procedure applied in III. will now be conducted across all markers bearing missing data.

- `Ot`: a minimum number of observed values. Missing data at a variable bearling a number of observed values below this threshold will be ignored.


In [17]:
#####
##### getting everything together
predicted_out= []
truth_out= []

### Skip threshold
### skip markers with less than Ot observed variables.
Ot= 5

### Impute missing data
###

NA_as= np.argwhere(np.isnan(data_treat))
print('total initial NA: {}'.format(len(NA_as)))

NA_lib= defaultdict( list )

for n,v in NA_as:
    NA_lib[v].append(n)

for n in range(data_treat.shape[1]):
    if n not in NA_lib.keys():
        NA_lib[n]= []


N_lib_sort= sorted(NA_lib, key=lambda k: len(NA_lib[k]), reverse=True)

print(len(NA_lib))

clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(10,3), random_state=1)

Summary= []

Accuracy= np.nan

for snp in range(len(N_lib_sort)):
    mark= N_lib_sort[snp]
    if NA_lib[mark]:
        #Present= NA_lib[mark]
        touched= NA_lib[mark]
        #print(len(touched))
        #for touch in Present:
        #touched= [touch]
        
        #print(NA_tree[mark])
        #print(data_treat[touched,mark])

        SNP_clean_target= {x:NA_lib[x] for x in range(data_treat.shape[1]) if len([y for y in touched if y in NA_lib[x]]) == 0}

        ladder= 80
        burn= 100

        Chose_set= [x for x in SNP_clean_target.keys()]

        if sum([z for z in it.chain(*SNP_clean_target.values())]) > 0:
            #print(len(SNP_clean_target))
            records= optimize_size(SNP_clean_target,ladder,burn,data_treat.shape[0],data_treat.shape[1])

            Chose_set= max(records.keys())

            Chose_set= [x for x in records[Chose_set][0]]

        affected= list(set([x for x in it.chain(*[SNP_clean_target[z] for z in Chose_set])]))
        not_affected= [x for x in range(data_treat.shape[0]) if x not in affected]
        #print(data_treat[not_affected,mark])

        Clean_set= data_treat[:,Chose_set]
        Clean_set= Clean_set[not_affected,:]

        #print(Clean_set.shape)

        n_comp = 3

        pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')
        features = pca.fit_transform(Clean_set)

        #### MeanShift clusters

        bandwidth = estimate_bandwidth(features, quantile=0.15)
        params = {'bandwidth': np.linspace(np.min(features), np.max(features),20)}
        grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0)

        ## perform MeanShift clustering.
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all= True, min_bin_freq=10)
        ms.fit(features)
        labels1 = ms.labels_
        label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1))) if y != -1}
        #label_select= {0:[x for x in range(len(labels1))]}
        ### why not
        iu_control= np.triu_indices(len(label_select),1)
        MS_centroids= [np.mean(features[label_select[z],:],axis= 0) for z in label_select.keys()]
        ###

        ###

        #print('cluster size: {}'.format([len(x) for x in label_select.values()]))

        ###
        local_truth= []
        local_pred= []
        
        
        for cp in label_select.keys():

            train_keys= [not_affected[x] for x in label_select[cp] if not_affected[x] not in touched] # Present
            if len(train_keys) <= Ot:
                continue

            test_keys= [not_affected[x] for x in label_select[cp] if not_affected[x] in touched]

            y_train= data_treat[train_keys,mark]

            from sklearn import neighbors
            neibh_opt= 15
            weights= 'distance'

            if len(train_keys) < neibh_opt:
                n_neighbors= len(train_keys)
            if len(train_keys) > neibh_opt:
                n_neighbors= neibh_opt

            if len(label_select[cp]) > 3:

                X_train= np.array([Clean_set[x,:] for x in label_select[cp] if not_affected[x] not in touched]) # Present
                #X_train= Clean_set[train_keys,:]
                X_test= np.array([Clean_set[x,:] for x in label_select[cp] if not_affected[x] in touched])
                if len(X_test) == 0:
                    continue

                clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
                clf.fit(X_train, y_train)

                Z = clf.predict(X_test)

                truth_out.extend([NA_tree[mark][x] for x in test_keys])
                #print('concerned: {}'.format(test_keys))
                predicted_out.extend(Z)
                
                local_truth.extend([NA_tree[mark][x] for x in test_keys])
                local_pred.extend(Z)

                #for rev in range(len(test_keys)):
                #    data_treat[rev,mark]= Z[rev]
                
                #Stat= []

                if len(predicted_out) >= 5:
                    Accuracy= 1 - len([x for x in range(len(truth_out)) if truth_out[x] != predicted_out[x]]) / len(truth_out)
                    #print(Accuracy)
        
        ###
        ### Print
        if local_truth:
            local_accuracy= len([x for x in range(len(local_truth)) if local_truth[x] == local_pred[x]]) / float(len(local_truth))
            CPs= '-'.join([str(len(x)) for x in label_select.values()])
            
            Summary.append([snp,mark,len(touched),Clean_set.shape[0],Clean_set.shape[1],CPs,Accuracy,local_accuracy])
            #print(Summary[-1])
        
        
        ###
        ###

Summary= np.array(Summary)

print(Summary.shape)

summary_table= pd.DataFrame(Summary, columns= ['idx','snp','NA','Nused','M','MSamples','Accuracy','local acc.'])


total initial NA: 763
150
(149, 8)


**Summary Table columns** 

- *idx*: order of analysis. Variables were analysed in order of decreasing missing data.*

- *snp*: index of variable selected.

- *NA*: Number of missing observations at variable selected.

- *Nused*: Number of variables used for inference.

- *M*: Total number of observations used for inference.

- *MSamples*: Size of Mean Shift clusters inferred.

- *Accuracy*: incremental estimate of accuracy.

- *local acc.*: Accuracy estimate at the locus analysed.

In [18]:

summary_table

Unnamed: 0,idx,snp,NA,Nused,M,MSamples,Accuracy,local acc.
0,0,34,10,150,93,53-23-24-22-21-6-1,1.0,1.0
1,1,87,10,161,87,59-32-23-22-19-6,1.0,1.0
2,2,50,10,166,79,63-30-24-22-19-7-1,1.0,1.0
3,3,115,10,159,84,78-45-36,1.0,1.0
4,4,7,10,166,81,59-31-27-21-22-6,1.0,1.0
5,5,146,9,153,90,79-36-27-10-1,1.0,1.0
6,6,16,9,164,82,56-29-32-36-11,1.0,1.0
7,7,100,9,160,85,82-45-27-6,1.0,1.0
8,8,91,9,176,76,87-35-32-14-6-2,0.9512195121951219,0.5
9,9,1,9,158,84,79-30-26-18-5,0.9560439560439561,1.0


### V. Imputation and genetic structure.

In [None]:
############################################################################
############################################################################
##### Simulations and such #################################################

#######################################
 ######### Two-way admixture #########
#######################################

Sizes= [120,100,80]

Origins= {
    x:{
        y:[int(z == x) for z in range(len(Sizes) - 1)] for y in range(Sizes[x])
    } for x in range(len(Sizes) - 1)
}

import random

Admixed_proportions= [random.randrange(0,100) / float(100) for x in range(Sizes[-1])]

Origins[len(Sizes) - 1]= {
    y: [Admixed_proportions[y], 1 - Admixed_proportions[y]] for y in range(Sizes[-1])
}


#### we can store these in a way that will facilitate later requests:
Whose= []
ind_to_group= {}
label_vector= []
d= 0

for gp in Origins.keys():
    for acc in range(len(Origins[gp])):
        Whose.append(d)
        ind_to_group[d]= [gp,acc]
        label_vector.append(gp)
        d += 1


labels= [0,1]
target= [0,1]
Chr= 1
color_ref= ['red','yellow','blue','black','orange','purple','green','silver','red3','deepskyeblue','navy','chartreuse','darkorchid3','goldenrod2']

COp= 5e-3

label_indicies= {x:[y for y in range(len(label_vector)) if label_vector[y] == x] for x in Origins.keys()}

Windows= recursively_default_dict()

haplotypes= {x:[] for x in Whose}
Out= {1:{}}
Ideo= []
Blocks= {1:{}}
Fst_windows= []
Fst_crawl= []
Fst_labels= []

target_indx= {z:[x for x in range(len(label_vector)) if label_vector[x] == z] for z in labels}
threshold= .005
P= 30

current= recursively_default_dict()

for angle in np.arange(1,20,.1):
    bl= int(angle*10000)
    end= bl+ 999
    Out[1][bl]= end
    coords= features[Pops,:]
    vector2= coords[target[1]] - coords[target[0]]
    
    coords[target[0]] = coords[target[0]] + [sin(angle) * x for x in vector2]
    
    new_freqs= pca.inverse_transform(coords)
    scramble= [x for x in range(new_freqs.shape[1])]
    shuffle(scramble)
    new_freqs[target[0]]= pca.inverse_transform(coords[target[0]])
    
    new_freqs= new_freqs[:,scramble]
    
    N_pops= len(Pops)
    
    data= []
    local_labels= []
    
    for acc in range(len(Whose)):
        Subject = 'sample' + str(acc)
        
        transition_p= Origins[ind_to_group[acc][0]][ind_to_group[acc][1]]
        
        if current[acc]:
            cross_over= np.random.choice([0,1], p=[1-COp,COp])
            if cross_over == 1:
                k= np.random.choice(labels, p=transition_p)
                current[acc]= k
            else:
                k= current[acc]
        else:
            k= np.random.choice(labels, p=transition_p)
            current[acc]= k
        
        probs= new_freqs[k,:]
        
        probs[(probs > 1)]= 1
        probs[(probs < 0)]= 0
        
        Haps= [np.random.choice([0,1],p= [1-probs[x],probs[x]]) for x in range(L)]
        
        Stock = ['Region_'+str(Chr)+ '_' + Subject,bl,end,color_ref[k]]
        Ideo.append(Stock)
        data.append(Haps)
        local_labels.append(k + 1)
    
    data= np.array(data)
    
    for hap in range(data.shape[0]):
        haplotypes[hap].extend(data[hap,:])
    
    pca2 = PCA(n_components=3, whiten=False,svd_solver='randomized')
    
    data= pca2.fit_transform(data)
    
    profiles= Modules_tools.extract_profiles(data,target_indx)
    
    ### get population fsts
#    Pairwise= return_fsts2(new_freqs)
#    Fst_labels.extend(Pairwise.pops)
    
#    Fst_crawl.extend(Pairwise.fst)
    
#    Fst_windows.extend([bl] * Pairwise.shape[0])
    ### store stuff.
    Blocks[1][bl]= local_labels
    Windows[bl]= profiles


Windows= {1:Windows}
