In [3]:
import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import MeanShift, estimate_bandwidth

import pandas as pd

from scipy import stats
from scipy.stats import beta
from math import sin
from random import randint
from IPython.display import clear_output
import matplotlib.pyplot as plt
import itertools as it

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

import collections

def recursively_default_dict():
        return collections.defaultdict(recursively_default_dict)

from matplotlib.collections import BrokenBarHCollection
import re

from structure_tools.Modules_tools import return_fsts

PCA_color_ref= ['darkseagreen','crimson', 'darkorange', 'darkblue', 'darkcyan',
            'darkgoldenrod', 'darkgray', 'darkgrey', 'darkgreen',
            'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange',
            'darkorchid', 'darkred', 'darksalmon', 'darkseagreen',
            'darkslateblue', 'darkslategray', 'darkslategrey',
            'darkturquoise', 'darkviolet', 'deeppink']

## vcf analysis
Jupyter notebook for the local analysis of genetic data stored in .vcf format.

Perform analysis of structure across data set, followed by a more detailed study of variation across local genomic windows.

### Input

In [21]:
from structure_tools.vcf_geno_tools import simple_read_vcf

vcf_file= 'data_cleanRefs_Gap01_Admx.vcf'

genotype, summary, info_save= simple_read_vcf(vcf_file,row_info= 5,header_info= 9,phased= True)

print('Number of markers: {}'.format(genotype.shape[1]))
print('Number of individuals: {}'.format(genotype.shape[0]))

Number of markers: 40000
Number of individuals: 130


In [22]:
summary.head()


Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT
0,1,37,1,A,T,.,PASS,.,GT:AD:DP
1,1,135,2,A,T,.,PASS,.,GT:AD:DP
2,1,149,3,A,T,.,PASS,.,GT:AD:DP
3,1,252,4,A,T,.,PASS,.,GT:AD:DP
4,1,293,5,A,T,.,PASS,.,GT:AD:DP


### Global variation

Perform PCA across data set.

Perform Mean shift clustering to attempt to extract genetically coherent groups of accessions.

These will later be used for supervised analysis.

In [23]:
from structure_tools.Tutorial_subplots import plot_global_pca

## Perform PCA
n_comp= 3
pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')

feats= pca.fit_transform(genotype)

In [24]:
## perform MeanShift clustering.
bandwidth = estimate_bandwidth(feats, quantile=0.2)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=True, min_bin_freq=45)
ms.fit(feats)
labels1 = ms.labels_
label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1)))}
###

In [25]:
###
plot_global_pca(feats,label_select,PCA_color_ref,title= 'global_pca',height= 500,width= 950)

In [26]:
select_refs= [0,1,2,4]
label_vector= [[max(select_refs)+1,labels1[x]][int(labels1[x] in select_refs)] for x in range(genotype.shape[0])]

Whose= list(range(genotype.shape[0]))


In [27]:
summary.shape

(40000, 9)

In [28]:
SequenceStore= {}

Nwindows= 100
Wsizes= 100
chrom= 1
wst= np.random.randint(0,genotype.shape[1] - Wsizes,size= Nwindows)
wst= np.linspace(0,genotype.shape[1] - Wsizes,Nwindows,dtype= int)
SequenceStore= {
    chrom: {summary.POS[st]: genotype[:,st:(st+Wsizes-1)] for st in wst}
}

Out= {chrom: {summary.POS[st]: summary.POS[st+ Wsizes - 1]for st in wst}}



In [29]:
supervised= True

Bandwidth_split= 30 # grid split for kde 
KDE_comps= 4 # PCA components to retain
clsize= 15 # minimum cluster size to retain during ms clustering
control_sampling= False
control_N= 100
amova= True # whether to calculate amova.

In [30]:
from structure_tools.StructE_tools import findPhiPT, Structure_profiles, Distance_profiles

from structure_tools.AMOVA_func import amova_cofactor, AMOVA_FM42
from structure_tools.mstutorial_tools import Windows_KDE_amova

### Perform Distance and association analysis on the data sets generated
ref_gps= [0,1,2]

refs_lib= {v:g for v,g in label_select.items() if v in ref_gps}
admx_lib= {v:g for v,g in label_select.items() if v not in ref_gps}
admx_lib.update(refs_lib)
import itertools as it

Results, Construct, PC_var= Windows_KDE_amova(SequenceStore,
                                              admx_lib,
                                              refs_lib,
                                              supervised= True,
                                              amova= True,
                                              ncomps= KDE_comps,
                                              clsize= clsize,
                                              Bandwidth_split= Bandwidth_split)

chr 1, where: 1995078, supervised: True, n clusters: 3
old: ; jaccard: 0.10551612751901025; PCA euc: 0.40058482526777484; nHam: 0.20029415642391907


In [31]:
AMOVA_stats= [[[Chr,wind,*Results[Chr][wind]] for wind in Results[Chr].keys()] for Chr in Results.keys()]
AMOVA_stats= np.array([y for y in it.chain(*AMOVA_stats)])

Names= ['updt jaccard','updt euc','updt hamming']

fig_data= [go.Scatter(
    x= AMOVA_stats[:,1],
    y= AMOVA_stats[:,x],
    mode= 'markers',
    name= Names[x - 3]
) for x in range(3,6)]

layout = go.Layout(
    title= 'Stats',
    yaxis=dict(
        title='AMOVA'),
    xaxis=dict(
        title='Windows')
)

fig= go.Figure(data=fig_data, layout=layout)
iplot(fig)

In [32]:
select_refs= [0,1,2,4]
label_vector= [[len(select_refs),labels1[x]][int(labels1[x] in select_refs)] for x in range(genotype.shape[0])]

Whose= list(range(genotype.shape[0]))


In [33]:
Names=['id' + str(x) for x in range(len(Whose))]
Fam= {
    Names[x]:x for x in range(len(Names))
}

Fam.update({
    x:Names[x] for x in range(len(Names))
})

###
Dr_dim= 3

###
focus_subset= False
Geneo= admx_lib
Focus_group= 0

Focus= [Names[x] for x in Geneo[Focus_group]]

###
Dr_var= 'all'
target_var= [0]

##

In [34]:
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans

Method= 'MeanShift'

Cl_store= {
    'MeanShift':{
        'Clusterfunc': MeanShift,
        'cluster_kwargs': {
            'bin_seeding': False,
            'cluster_all': True,
            'min_bin_freq': 15
        }
    },
    'DBscan':{
        'Clusterfunc': DBSCAN,
        'cluster_kwargs': {
            'min_samples': 15
        }
    },
    'Ward':{
        'Clusterfunc': AgglomerativeClustering,
        'cluster_kwargs': {
            'linkage': 'ward',
            'n_clusters': 4
        }
    },
    'Kmeans':{
        'Clusterfunc': KMeans,
        'cluster_kwargs': {
            'random_state': 0,
            'n_clusters': 3
        }
    }
}

In [35]:
from structure_tools.mstutorial_tools import MAC_process

preProc_Clover, Cameo, Coordinates, COMPS, X_se, label_select, Subset, labels1= MAC_process(Construct,
                                                                             Out,
                                                                             Cl_store,
                                                                             refs_lib,
                                                                             Fam,
                                                                             Names= Names,
                                                                             target_var= target_var,
                                                                             Dr_var= Dr_var,
                                                                             focus_subset= focus_subset,
                                                                             Focus= Focus,
                                                                             Dr_dim= Dr_dim,
                                                                             Method= Method)

Clover shape:  (280, 130)
Clover shape:  (280, 130)
focusing Dr on all


In [36]:
from plotly import tools
from structure_tools.mstutorial_tools import KDE_pca

KDE_pca(feats= feats,Cameo= Cameo,label_vector= label_vector,Subset= Subset, 
       Col_vec= PCA_color_ref)

['Global', 'Global', 'cluster 1', 'cluster 1', 'cluster 2', 'cluster 2', 'cluster 3', 'cluster 3', 'cluster 4', 'cluster 4']



plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



### Reconstruct actual tree. 

Chose nodes to construct the tree with.

In [74]:
cluster_include= [0,1,2]

label_keep= {z:g for z,g in label_select.items() if z in cluster_include}

In [79]:


def MS_get_norm(Sequences,refs_lib,ncomps= 4,clsize= 15,Bandwidth_split= 20,
               pca_qtl= 0.2):
    '''
    Perform PCA + Mean Shift across windows. Extract Meanshift p-value vectors. Perform amova (optional).
    '''

    pca = PCA(n_components=ncomps, whiten=False,svd_solver='randomized').fit(Sequences)
    data = pca.transform(Sequences)

    params = {'bandwidth': np.linspace(np.min(data), np.max(data),Bandwidth_split)}
    grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0,cv= 3, iid= False)

    ######################################
    ####### TEST global Likelihood #######
    ######################################
    Focus_labels = [z for z in it.chain(*refs_lib.values())]

    #### Mean Shift approach
    ## from sklearn.cluster import MeanShift, estimate_bandwidth

    bandwidth = estimate_bandwidth(data, quantile= pca_qtl, n_samples=len(Focus_labels))
    if bandwidth <= 1e-3:
        bandwidth = 0.1

    ms = MeanShift(bandwidth=bandwidth, cluster_all=False, min_bin_freq=clsize)
    ms.fit(data[Focus_labels,:])
    labels = ms.labels_


    Tree = {x:[Focus_labels[y] for y in range(len(labels)) if labels[y] == x] for x in [g for g in list(set(labels)) if g != -1]}
    Keep= [x for x in Tree.keys() if len(Tree[x]) > clsize]

    Tree= {x:Tree[x] for x in Keep}
    Ngps= len(Tree)

    ### Extract MScluster likelihood by sample

    dist_store= {}

    for hill in Tree.keys():

        grid.fit(data[Tree[hill],:])

        # use the best estimator to compute the kernel density estimate
        kde = grid.best_estimator_

        # normalize kde derived log-likelihoods, derive sample p-values
        P_dist = kde.score_samples(data[Tree[hill],:])
        Dist = kde.score_samples(data)
        P_dist= np.nan_to_num(P_dist)
        Dist= np.nan_to_num(Dist)
        if np.std(P_dist) == 0:
            Dist= np.array([int(Dist[x] in P_dist) for x in range(len(Dist))])
        else:
            Dist = scipy.stats.norm(np.mean(P_dist),np.std(P_dist)).cdf(Dist)
            Dist= np.nan_to_num(Dist)
            dist_store[hill]= Dist
    
    return Tree,dist_store


def kde_gen_dict(data,label_dict):
    '''
    create dictionary of group kde generators in data space.
    '''
    
    params = {'bandwidth': np.linspace(np.min(data), np.max(data),Bandwidth_split)}
    grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0,cv= 3, iid= False)

    ref_gens= {}

    for hill in label_dict.keys():

        grid.fit(data[label_dict[hill],:])
        # use the best estimator to compute the kernel density estimate
        kde = grid.best_estimator_
        ref_gens[hill]= kde
    
    return ref_gens

def gen_class(samples,ref_generators,lb= 1e-3,out_code= -1):
    '''
    use kde generators in dictionary to score and classify samples.
    '''
    
    ref_keys= list(ref_generators.keys())
    score_dict= {z: g.score_samples(samples) for z,g in ref_generators.items()}
    
    score_array= [score_dict[z] for z in ref_keys]
    score_array= np.array(score_array)
    
    maxl= np.argmax(score_array,axis= 1)
    maxl= np.array(ref_keys)[maxl]
    
    maxs= np.max(score_array,axis= 1)
    maxs= maxs < lb
    
    maxl[maxs]= out_code
    #print(maxl)
    return maxl


In [80]:
Nwindows= 100
Wsizes= 100
chrom= 1
wst= np.random.randint(0,genotype.shape[1] - Wsizes,size= Nwindows)
wst= np.linspace(0,genotype.shape[1] - Wsizes,Nwindows,dtype= int)
SequenceStore= {
    chrom: {summary.POS[st]: genotype[:,st:(st+Wsizes-1)] for st in wst}
}

Out= {chrom: {summary.POS[st]: summary.POS[st+ Wsizes - 1]for st in wst}}


In [81]:
from IPython.display import clear_output
from sklearn.metrics import pairwise_distances

pca_qtl= 0.2
ncomps= 5
clsize= 15
Bandwidth_split= 20
out_code= -1
metric= 'euclidean'
lb= 1e-3
cl_samp= 50

Geneo= admx_lib
Geneo_order= list(Geneo.keys())
ref_order= list(refs_lib.keys())

Whose= list(range(sum([len(x) for x in Geneo.values()])))
Sup_labels= list(np.repeat(Geneo_order,[len(Geneo[x]) for x in Geneo_order]))

### Define parameters and libraries of analyses.

Results = {x:recursively_default_dict() for x in SequenceStore.keys()}

###
###
clov_pca= PCA(n_components=ncomps, whiten=False,svd_solver='randomized').fit(preProc_Clover)
data_clov= clov_pca.transform(preProc_Clover)



ref_gens= kde_gen_dict(data_clov,label_keep)

dists_dict= {z:{y:[] for y in ref_gens.keys()} for z in ref_gens.keys()}

for CHR in SequenceStore.keys():
    print('going on CHR: '+ str(CHR))
    for c in SequenceStore[CHR].keys():

        ### PCA and MeanShift of information from each window copied from *FM36_Galaxy.py.
        Sequences= [SequenceStore[CHR][c][x] for x in Whose]
        Sequences= np.array(Sequences) 
        Sequences= np.nan_to_num(Sequences)
        
        clust_acc, ms_local= MS_get_norm(Sequences,refs_lib,ncomps= ncomps,clsize= clsize,Bandwidth_split= Bandwidth_split,
               pca_qtl= pca_qtl)
        
        mskeys= list(ms_local.keys())
        
        dist_array= [ms_local[g] for g in mskeys]
        dist_array= np.array(dist_array)
        qtl_dist= clov_pca.transform(dist_array)
        
        ## classify kde profiles.
        cluster_class= gen_class(qtl_dist,ref_gens,lb= lb)
        
        if len(list(set(cluster_class[cluster_class != out_code]))) <= 1:
            continue
        
        cluster_found= {z: [x for x in range(len(cluster_class)) if cluster_class[x] == z] for z in list(set(cluster_class)) if z != -1}
        
        for v,g in cluster_found.items():
            dist_foud= qtl_dist[g]
            if dist_foud.shape[0] > 1:
                dist_foud= np.mean(g,axis= 1)
            
            g= dist_foud
        
        cluster_found= {mskeys[z]:g for z,g in cluster_found.items()}
        cluster_keys= list(cluster_found.keys())
        lclust_gens= kde_gen_dict(Sequences,clust_acc)
        
        lclust_samp= {z:g.sample(cl_samp) for z,g in lclust_gens.items()}
        lclust_means= {z: np.mean(g,axis= 1) for z,g in lclust_samp.items()}
        
        hills= [lclust_means[z] for z in cluster_keys]
        hills= np.array(hills)
        hill_dists= pairwise_distances(hills,metric= metric)
        
        for idx in range(len(cluster_keys)):
            for idx1 in range(len(cluster_keys)):
                if idx != idx1:
                    cd1= cluster_keys[idx]
                    cd2= cluster_keys[idx1]
                    dists_dict[cd1][cd2].append(hill_dists[idx,idx1])


        

going on CHR: 1


In [111]:
from plotly import subplots

def plot_distances(dists_dict,gp,range_dists,height= 500,width= 900):
    Ncols= 1
    
    keys_get= sorted([v for v,g in dists_dict[gp].items() if len(g)])
    titles= ['cl: {}'.format(g) for g in keys_get]

    fig_subplots = subplots.make_subplots(rows= int(len(titles) / float(Ncols)) + (len(titles) % Ncols > 0), cols=Ncols,
                             subplot_titles=tuple(titles))
    
    for idx in range(len(titles)):
        print(idx)
        ref= keys_get[idx]
        pos1= int(float(idx) / Ncols) + 1
        pos2= idx - (pos1-1)*Ncols + 1

        title= titles[idx]
        
        data= dists_dict[gp][ref]
        data= np.array(data).reshape(-1,1)
        kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(data)
        scor_dist= kde.score_samples(range_dists)
        scor_dist= np.exp(scor_dist)
        trace1= go.Scatter(
            y= scor_dist,
            x= range_dists.T[0],
            mode= 'markers',
            name= titles[idx]
        )
        
        fig_subplots.append_trace(trace1, pos1, pos2)
        
        fig_subplots['layout']['yaxis' + str(idx + 1)].update(title= 'L')
        fig_subplots['layout']['yaxis' + str(idx + 1)].update(range= [0,max(scor_dist) + max(scor_dist)/10])
        fig_subplots['layout']['xaxis' + str(idx + 1)].update(title= 'pca dist')

    layout = go.Layout(
        title= title,
    )

    fig= go.Figure(data=fig_subplots, layout=layout)
    
    fig['layout'].update(height= height,width= width)
    
    
    iplot(fig)


range_dists= np.linspace(0,3,50)
range_dists= range_dists.reshape(-1,1)
gp= 0

for gp in sorted(dists_dict.keys()):
    plot_distances(dists_dict,gp,range_dists,height= 500,width= 900)

0
1


0
1


0
1


In [None]:
### Compare Full data and inferred true distances:


In [None]:

def MAC_process(Construct,Out,Cl_store,refs_lib,Fam,Names= [],target_var= [],Dr_var= 'all',focus_subset= False,Focus= [],Dr_dim= 4,threshold= 0.1,Method= 'MeanShift'):

    Coordinates = [[[[CHR,bl,Out[CHR][bl],x] for x in Construct[CHR][bl].keys()] for bl in sorted(Construct[CHR].keys())] for CHR in sorted(Construct.keys())]
    Coordinates = [z for z in it.chain(*[y for y in it.chain([x for x in it.chain(*Coordinates)])])]


    Coordinates= np.array(Coordinates)
    

    Clover= [[[Construct[CHR][bl][x] for x in Construct[CHR][bl].keys()] for bl in sorted(Construct[CHR].keys())] for CHR in sorted(Construct.keys())]
    Clover= [z for z in it.chain(*[y for y in it.chain(*Clover)])]
    Clover= np.array(Clover)
    Clover.shape
    
    Membership=[]
    
    for CHR in sorted(Construct.keys()):
        for bl in sorted(Construct[CHR].keys()):
            
            Bls= sorted(list(Construct[CHR][bl].keys()))
            pVals= np.array([Construct[CHR][bl][y] for y in Bls])
            
            max_vals= np.amax(pVals,axis= 0)
            max_indx= np.argmax(pVals,axis= 0)
            
            inlier= [x for x in range(pVals.shape[1]) if max_vals[x] >= threshold]
            
            BL_select= list(set([max_indx[x] for x in inlier]))
            
            #print('clusters {} selected. {} %'.format(BL_select,len(BL_select)/float(len(Bls))))
            
            if not BL_select:
                Empty.append([CHR,bl])
                continue
            
            BL_select= { 
                x: pVals[x] for x in BL_select
                }
            
            Assignment= {
                    Bls[b]: [x for x in inlier if max_indx[x] == b] for b in BL_select.keys()
                }
            
            for cl in Bls:
                if cl not in BL_select.keys():
                    vector= ''
                else:
                    vector= '.'.join([str(x) for x in Assignment[cl]])
                
                Membership.append(vector)

    
    #Membership= np.array(Membership)
    
    Coordinates= pd.DataFrame(Coordinates,columns= ['chrom','start','end','bl'])
    Coordinates['members']= Membership
    

    from sklearn import preprocessing

    Clover = np.nan_to_num(Clover)
    preProc_Clover = Clover

    print('Clover shape: ', Clover.shape)

    Clover = preprocessing.scale(Clover,axis = 1)
    #

    print("Clover shape: ", Clover.shape)


    reefer= [g for g in it.chain(*[refs_lib[y] for y in sorted(refs_lib.keys())])]

    if not focus_subset:
        Subset= list(range(Clover.shape[1]))
    else:
        Subset= [Names.index(x) for x in Focus]

    ## apply pca to reference accessions, transform the rest.

    Dr_processes= ['target','focus_inc','all']

    if Dr_var not in Dr_processes:
        print('Dr_process selected: {}, Dr_var processes available: {}'.format(Dr_var,Dr_processes))
        Dr_var= 'target'

    print('focusing Dr on {}'.format(Dr_var))

    if Dr_var== 'target':
        variation_focus= [Names.index(Fam[x]) for x in it.chain(*[refs_lib[z] for z in target_var])]

    if Dr_var== 'focus_inc':
        variation_focus= [Names.index(x) for x in Focus]
        variation_focus.extend([Names.index(Fam[x]) for x in it.chain(*[refs_lib[z] for z in target_var])])

    if Dr_var== 'all':
        variation_focus= list(range(Clover.shape[1]))


    ### PCA
    pca = PCA(n_components=Dr_dim, whiten=False).fit(Clover[:,variation_focus].T)
    X_se = pca.transform(Clover[:,Subset].T)
    COMPS = pca.components_.T*np.sqrt(pca.explained_variance_)


    ###############################################################################
    ########################### PAINTING SHIT!! ###################################
    ###############################################################################

    ## 
    ## CLUSTER EIGENVALUES
    ##

    bandwidth = estimate_bandwidth(COMPS, quantile=0.1)
    if bandwidth==0:
        bandwidth = 0.1

    func_cl= Cl_store[Method]['Clusterfunc']
    func_kwargs= Cl_store[Method]['cluster_kwargs']


    Clusterfunck= func_cl(**func_kwargs)
    Clusterfunck.fit(COMPS)

    labels1 = Clusterfunck.labels_
    label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1)))}


    ###############################################################################
    #### Average normalized likelihhod among clustered eigenvectors by haplotype #####
    ###############################################################################


    Cameo = []

    for cramp in sorted(label_select.keys()):
        Clamp = np.mean(preProc_Clover[label_select[cramp],:],axis = 0)
        Fry = [Clamp[x] for x in Subset]
        Cameo.append(Fry)

    Cameo = np.array(Cameo).T


    ###########################################################################
    ### cosine of the clustered eigenvectors with haplotype coordinates ######## DEPRECATED
    ###########################################################################

    #cos_threshold = .6
    #
    #
    #from numpy import dot
    #from numpy.linalg import norm
    #
    #SpaceY = recursively_default_dict()
    #
    #for g in label_select.keys():
    #    Green = COMPS[label_select[g],:]
    #    SpaceY[g] = [mean(dot(X_se[x],Green.T)/norm(Green,axis=1)/norm(X_se[x])) for x in range(X_se.shape[0])]
    #
    #Globe = np.array([SpaceY[x] for x in sorted(SpaceY.keys())]).T
    #
    #

    ######## Reducing the number of cluster profiles to print:
    new_labs= labels1

    return preProc_Clover, Cameo, Coordinates, COMPS, X_se, label_select, Subset, labels1




In [37]:
preProc_Clover.shape

(280, 130)

In [None]:


def KDE_pca(feats= [],Cameo= [],label_vector= [],Subset= [],Col_vec= [],height= 2000,width= 1000):
    
    Ncols= 2
    titles=['Global']
    titles.extend(['cluster ' + str(x + 1) for x in range(Cameo.shape[1])])
    titles= list(np.repeat(titles,2))
    print(titles)

    fig_pca_subplots = tools.make_subplots(rows= int(len(titles) / float(Ncols)) + (len(titles) % Ncols > 0), cols=Ncols,
                             subplot_titles=tuple(titles))
    
    #####
    for subp in range(len(titles)):
        #print(subp)
        
        pos1= int(float(subp) / Ncols) + 1

        pos2= subp % Ncols + 1
        
        n_plot= subp
        
        
        if subp >= 2:
            gradient= Cameo[:,pos1-2]

            trace= go.Scatter(
            x = feats[:,0],
            y = feats[:,pos2],
            mode= "markers",
            marker= {
                'color': gradient,
                'colorscale':'Viridis',
                'line': {'width': 0},
                'size': 6,
                'symbol': 'circle',
                "opacity": .6
            })

            fig_pca_subplots.append_trace(trace, row=pos1, col=pos2)

        else:
            coords= {z:[x for x in Subset if label_vector[x] == z] for z in list(set(label_vector))}

            for i in coords.keys():
                if coords[i]:
                    trace= go.Scatter(
                    x = feats[coords[i],0],
                    y = feats[coords[i],pos2],
                    mode= "markers",
                    name= str(i),
                    marker= {
                    'color': Col_vec[i],
                    'line': {'width': 0},
                    'size': 6,
                    'symbol': 'circle',
                    "opacity": .8})

                    fig_pca_subplots.append_trace(trace, row=pos1, col=pos2)

        fig_pca_subplots['layout']['yaxis' + str(n_plot + 1)].update(title='PC{}'.format(pos2+1))
        fig_pca_subplots['layout']['xaxis' + str(n_plot + 1)].update(title='PC1')
    
    fig_pca_subplots['layout'].update(height= height,width= width)
    
    #fig= go.Figure(data=fig_pca_subplots, layout=layout)
    iplot(fig_pca_subplots)
