In [17]:
import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import MeanShift, estimate_bandwidth

import pandas as pd

from scipy import stats
from scipy.stats import beta
from math import sin
from random import randint
from IPython.display import clear_output
import matplotlib.pyplot as plt
import itertools as it

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

import collections

def recursively_default_dict():
        return collections.defaultdict(recursively_default_dict)

from matplotlib.collections import BrokenBarHCollection
import re

from structure_tools.Modules_tools import return_fsts

PCA_color_ref= ['darkseagreen','crimson', 'darkorange', 'darkblue', 'darkcyan',
            'darkgoldenrod', 'darkgray', 'darkgrey', 'darkgreen',
            'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange',
            'darkorchid', 'darkred', 'darksalmon', 'darkseagreen',
            'darkslateblue', 'darkslategray', 'darkslategrey',
            'darkturquoise', 'darkviolet', 'deeppink']

## vcf analysis
Jupyter notebook for the local analysis of genetic data stored in .vcf format.

Perform analysis of structure across data set, followed by a more detailed study of variation across local genomic windows.

### Input

In [18]:
from structure_tools.vcf_geno_tools import simple_read_vcf

vcf_file= 'data_cleanRefs_Alien01_Admx.vcf'

genotype, summary, info_save= simple_read_vcf(vcf_file,row_info= 5,header_info= 9,phased= True)

print('Number of markers: {}'.format(genotype.shape[1]))
print('Number of individuals: {}'.format(genotype.shape[0]))

Number of markers: 40000
Number of individuals: 130


In [19]:
summary.head()


Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT
0,1,19,1,A,T,.,PASS,.,GT:AD:DP
1,1,57,2,A,T,.,PASS,.,GT:AD:DP
2,1,62,3,A,T,.,PASS,.,GT:AD:DP
3,1,163,4,A,T,.,PASS,.,GT:AD:DP
4,1,200,5,A,T,.,PASS,.,GT:AD:DP


### Global variation

Perform PCA across data set.

Perform Mean shift clustering to attempt to extract genetically coherent groups of accessions.

These will later be used for supervised analysis.

In [20]:
from structure_tools.Tutorial_subplots import plot_global_pca

## Perform PCA
n_comp= 3
pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')

feats= pca.fit_transform(genotype)

In [21]:
## perform MeanShift clustering.
bandwidth = estimate_bandwidth(feats, quantile=0.2)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=True, min_bin_freq=45)
ms.fit(feats)
labels1 = ms.labels_
label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1)))}

###

In [22]:
###
plot_global_pca(feats,label_select,PCA_color_ref,title= 'global_pca',height= 500,width= 950)

### AMOVA at local windows

Between population variance at local windows. 

#### I. Local windows

`Nwindows`: number of windows

`Wsizes`: size of each window (in feature number).


In [23]:
SequenceStore= {}

Nwindows= 100
Wsizes= 300
chrom= 1

wst= np.random.randint(0,genotype.shape[1] - Wsizes,size= Nwindows)
wst= np.linspace(0,genotype.shape[1] - Wsizes,Nwindows,dtype= int)
SequenceStore= {
    chrom: {summary.POS[st]: genotype[:,st:(st+Wsizes-1)] for st in wst}
}

Out= {chrom: {summary.POS[st]: summary.POS[st+ Wsizes - 1]for st in wst}}



#### AMOVA

Calculating local AMOVA. Chose which groups to calculate AMOVA between at each window using the labels in the first plot in `ref_gps` (above).

In [24]:
ref_gps= [0,1,2]


In [25]:
## AMOVA parameters.
supervised= True

Bandwidth_split= 30 # grid split for kde 
KDE_comps= 4 # PCA components to retain
clsize= 15 # minimum cluster size to retain during ms clustering.
amova= True # whether to calculate amova.


In [26]:
from structure_tools.StructE_tools import findPhiPT, Structure_profiles, Distance_profiles

from structure_tools.AMOVA_func import amova_cofactor, AMOVA_FM42
from structure_tools.mstutorial_tools import Windows_KDE_amova

### Perform Distance and association analysis on the data sets generated

refs_lib= {v:g for v,g in label_select.items() if v in ref_gps}
admx_lib= {v:g for v,g in label_select.items() if v not in ref_gps}
admx_lib.update(refs_lib)
import itertools as it

Results, Construct, PC_var= Windows_KDE_amova(SequenceStore,
                                              admx_lib,
                                              refs_lib,
                                              supervised= True,
                                              amova= True,
                                              ncomps= KDE_comps,
                                              clsize= clsize,
                                              Bandwidth_split= Bandwidth_split)

chr 1, where: 1985001, supervised: True, n clusters: 3
old: ; jaccard: -0.03415741106295015; PCA euc: 0.3099846090045286; nHam: 0.04746706430138808


In [27]:
AMOVA_stats= [[[Chr,wind,*Results[Chr][wind]] for wind in Results[Chr].keys()] for Chr in Results.keys()]
AMOVA_stats= np.array([y for y in it.chain(*AMOVA_stats)])

Names= ['updt jaccard','updt euc','updt hamming']

fig_data= [go.Scatter(
    x= AMOVA_stats[:,1],
    y= AMOVA_stats[:,x],
    mode= 'markers',
    name= Names[x - 3]
) for x in range(3,6)]

layout = go.Layout(
    title= 'Stats',
    yaxis=dict(
        title='AMOVA'),
    xaxis=dict(
        title='Windows')
)

fig= go.Figure(data=fig_data, layout=layout)
iplot(fig)

## Cluster Focus

In this section we identify clusters at local windows. For each cluster we will extract the _p_-value of every accession given that cluster. This _p_-value is calculated using the cluster kernel density estimation in PCA feature space (_MS profile_).

Cluster profiles are captured as normalized individual cdfs under specific cluster kernel density estimates in feature space.

The identification of clusters in feature space can be done using any of the methods available in the sklearn package. The method and parameters are defined in the `CL_store` dictionary below. 

First however we define which accessions to use in identifying clusters and extracting cluster profiles. 



#### i. Accession focus

Once again we can chose to use only a subset of accessions with which to identify clusters. To select these accessions use the labels.


In [28]:

select_refs= [0,1,2,4]
label_vector= [[len(select_refs),labels1[x]][int(labels1[x] in select_refs)] for x in range(genotype.shape[0])]

Whose= list(range(genotype.shape[0]))


In [29]:
Names=['id' + str(x) for x in range(len(Whose))]
Fam= {
    Names[x]:x for x in range(len(Names))
}

Fam.update({
    x:Names[x] for x in range(len(Names))
})

###
Dr_dim= 3

###
focus_subset= False
Geneo= admx_lib
Focus_group= 0

Focus= [Names[x] for x in Geneo[Focus_group]]

###
Dr_var= 'all'
target_var= [0]

##

#### ii. Local window custer profiles

In [30]:
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans

Method= 'MeanShift'

Cl_store= {
    'MeanShift':{
        'Clusterfunc': MeanShift,
        'cluster_kwargs': {
            'bin_seeding': False,
            'cluster_all': True,
            'min_bin_freq': 5
        }
    },
    'DBscan':{
        'Clusterfunc': DBSCAN,
        'cluster_kwargs': {
            'min_samples': 15
        }
    },
    'Ward':{
        'Clusterfunc': AgglomerativeClustering,
        'cluster_kwargs': {
            'linkage': 'ward',
            'n_clusters': 4
        }
    },
    'Kmeans':{
        'Clusterfunc': KMeans,
        'cluster_kwargs': {
            'random_state': 0,
            'n_clusters': 3
        }
    }
}

In [31]:
from structure_tools.mstutorial_tools import MAC_process

preProc_Clover, Cameo, Coordinates, COMPS, X_se, label_comps, Subset, labels_comp= MAC_process(Construct,
                                                                             Out,
                                                                             Cl_store,
                                                                             refs_lib,
                                                                             Fam,
                                                                             Names= Names,
                                                                             target_var= target_var,
                                                                             Dr_var= Dr_var,
                                                                             focus_subset= focus_subset,
                                                                             Focus= Focus,
                                                                             Dr_dim= Dr_dim,
                                                                             Method= Method)


Clover shape:  (211, 130)
Clover shape:  (211, 130)
focusing Dr on all


#### iii. cluster visualisation.

The function `MAC_process` performs dimensionality reduction on the _MS profiles_ extracted across local windows. _MS profiles_ are clustered (PCA + Mean Shift). 

Feature-wise, MS profiles contain the normalized probability of individual accessions towards window-level genetic clusters. We average MS_profiles by group. 

Below, the principal coordinates of Dr on the entire data sets are used to convey the ID of _MS profile_ features.

First the labels from the full set clustering are replotted.


In [33]:
from plotly import tools
from structure_tools.mstutorial_tools import KDE_pca

KDE_pca(feats= feats,Cameo= Cameo,label_vector= labels1,Subset= Subset, 
       Col_vec= PCA_color_ref)

['Global', 'Global', 'cluster 1', 'cluster 1', 'cluster 2', 'cluster 2', 'cluster 3', 'cluster 3', 'cluster 4', 'cluster 4']


## Reconstruct actual tree. 

We find that _MS profile_ groups correspond to specific clusters that show up in global Dr.

We also find that other _MS profile_ groups encompass more than one group of observations.

This is indicative of internal structure: At some local windows observations from two or more groups are found to cluster together. This can have several interpretations depending on your data. We are interested in capturing this data. 

We are going to focus on the clusters observed in the global Dr.

We are going to calculate distances between these clusters at windows where all three can be identified. We then compare these local distances to the global estimates calculating the whole data set.

The list `cluster_include` below determines which MS_profiles we chose to target. 

**Method** 

We will use the _MS profile_ groups displayed below as trainning sets. _MS profiles_ are stored in array `preProc_Clover` groups indexed in `label_comps`. This information is used to classify clusters identified at local windows. 

**cluster classification**

The array `preProc_Clover` is reduced and the kd of target (cluster_include) MS profile groups is estimated in feature space. 

Maximum likelihood classificaiton of Local _MS profiles_  is performed, allowing for outliers (max(L) < threshold). Outliers are discarded.



In [34]:
cluster_include= [0,2,3]

comp_label_keep= {z:g for z,g in label_comps.items() if z in cluster_include}

In [35]:
Nwindows= 80 # Number of windows
Wsizes= 100 # sizes in number of features
chrom= 1
wst= np.random.randint(0,genotype.shape[1] - Wsizes,size= Nwindows)
wst= np.linspace(0,genotype.shape[1] - Wsizes,Nwindows,dtype= int)
SequenceStore= {
    chrom: {summary.POS[st]: genotype[:,st:(st+Wsizes-1)] for st in wst}
}

Out= {chrom: {summary.POS[st]: summary.POS[st+ Wsizes - 1]for st in wst}}


In [36]:
from structure_tools.MS_target_tools import (
    MS_get_norm, kde_gen_dict, 
    gen_class, clustClass
)




In [37]:
from IPython.display import clear_output
from sklearn.metrics import pairwise_distances

pca_qtl= 0.2
ncomps= 2
clsize= 15
Bandwidth_split= 20
out_code= -1
metric= 'euclidean'
lb= 1e-3
cl_samp= 50

Geneo= admx_lib
Geneo_order= list(Geneo.keys())
ref_order= list(refs_lib.keys())

Whose= list(range(sum([len(x) for x in Geneo.values()])))
Sup_labels= list(np.repeat(Geneo_order,[len(Geneo[x]) for x in Geneo_order]))

### Define parameters and libraries of analyses.

Results = {x:recursively_default_dict() for x in SequenceStore.keys()}

###
###
clov_pca= PCA(n_components=ncomps, whiten=False,svd_solver='randomized').fit(preProc_Clover)
data_clov= clov_pca.transform(preProc_Clover)


ref_gens, ref_stats= kde_gen_dict(data_clov,comp_label_keep)
dists_dict= {z:{y:[] for y in ref_gens.keys()} for z in ref_gens.keys()}


for CHR in SequenceStore.keys():
    print('going on CHR: '+ str(CHR))
    for c in SequenceStore[CHR].keys():

        ### PCA.
        Sequences= [SequenceStore[CHR][c][x] for x in Whose]
        Sequences= np.array(Sequences) 
        Sequences= np.nan_to_num(Sequences)
        
        clust_acc, ms_local, feat_seq= MS_get_norm(Sequences,refs_lib,ncomps= ncomps,clsize= clsize,Bandwidth_split= Bandwidth_split,
               pca_qtl= pca_qtl)
        
        mskeys= list(ms_local.keys())
        
        #print(ms_local)
        cluster_found= clustClass(ms_local,clov_pca,ref_gens,gen_stats= ref_stats,out_code= out_code)
        cluster_found= {g[0]:z for z,g in cluster_found.items()}
        
        if len(cluster_found) > 2:
            print(c)
        
        clust_acc= {z: g for z,g in clust_acc.items() if z in cluster_found.keys()}
        clust_acc= {cluster_found[z]:g for z,g in clust_acc.items()}
        
        if not cluster_found:
            continue
        
        #cluster_found= {mskeys[z]:g for z,g in cluster_found.items()}
        cluster_keys= list(clust_acc.keys())
        
        lclust_gens, lclust_stats= kde_gen_dict(feat_seq,clust_acc)
        
        lclust_samp= {z:g.sample(cl_samp) for z,g in lclust_gens.items()}
        lclust_means= {z: np.mean(g,axis= 0) for z,g in lclust_samp.items()}
        
        #print([x.shape for x in lclust_samp.values()])
        hills= [lclust_means[z] for z in cluster_keys]
        hills= np.array(hills)
        hill_dists= pairwise_distances(hills,metric= metric)
        
        for idx in range(len(cluster_keys)):
            for idx1 in range(len(cluster_keys)):
                if idx != idx1:
                    cd1= cluster_keys[idx]
                    cd2= cluster_keys[idx1]
                    dists_dict[cd1][cd2].append(hill_dists[idx,idx1])



going on CHR: 1
605924
631370
681805
707003
732600
757862


In [38]:
from plotly import subplots

def D1_kdegen(dists_dict,kernel='gaussian', bandwidth=0.05):
    '''
    '''
    gen_dict= {}
    for gp,data in dists_dict.items():
        
        if not data:
            gen_dict[gp]= data
            continue
        data= np.array(data).reshape(-1,1)
        
        kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(data)
        gen_dict[gp]= kde

    return gen_dict



def plot_distances(dists_dict,gp,range_dists,height= 500,width= 900):
    Ncols= 1
    
    keys_get= sorted([v for v,g in dists_dict[gp].items() if len(g)])
    titles= ['cl: {}'.format(g) for g in keys_get]
    print(titles)
    
    dist_gens= {}
    
    if titles:
        fig_subplots = subplots.make_subplots(rows= int(len(titles) / float(Ncols)) + (len(titles) % Ncols > 0), cols=Ncols,
                                 subplot_titles=tuple(titles))

        for idx in range(len(titles)):
            print(idx)
            ref= keys_get[idx]
            pos1= int(float(idx) / Ncols) + 1
            pos2= idx - (pos1-1)*Ncols + 1

            title= titles[idx]

            data= dists_dict[gp][ref]
            data= np.array(data).reshape(-1,1)
            kde = KernelDensity(kernel='gaussian', bandwidth=.5).fit(data)
            dist_gens[ref]= kde
            
            scor_dist= kde.score_samples(range_dists)
            scor_dist= np.exp(scor_dist)
            trace1= go.Scatter(
                y= scor_dist,
                x= range_dists.T[0],
                mode= 'markers',
                name= titles[idx]
            )

            fig_subplots.append_trace(trace1, pos1, pos2)

            fig_subplots['layout']['yaxis' + str(idx + 1)].update(title= 'L')
            fig_subplots['layout']['yaxis' + str(idx + 1)].update(range= [0,max(scor_dist) + max(scor_dist)/10])
            fig_subplots['layout']['xaxis' + str(idx + 1)].update(title= 'pca dist')

        layout = go.Layout(
            title= title,
        )

        fig= go.Figure(data=fig_subplots, layout=layout)

        fig['layout'].update(height= height,width= width)

        iplot(fig)
        
        return dist_gens



range_dists= np.linspace(0,12,100)
range_dists= range_dists.reshape(-1,1)
gp= 0

dist_gens= {}

for gp in sorted(dists_dict.keys()):
    gp_sub_gens= plot_distances(dists_dict,gp,range_dists,height= 500,width= 900)
    dist_gens[gp]= gp_sub_gens

['cl: 2', 'cl: 3']
0
1


['cl: 0', 'cl: 3']
0
1


['cl: 0', 'cl: 2']
0
1


In [41]:
## distances between clusters in global PCA
dimN= 2

gp_combs=it.combinations(cluster_include,2)
gp_combs= list(gp_combs)
gl_coords= {z: feats[g,:dimN] for z,g in label_select.items()}

gl_dists= {z: pairwise_distances(gl_coords[z[0]],gl_coords[z[1]],metric= metric) for z in gp_combs}

rec_coords= {z: dist_gens[z[0]][z[1]].sample(50) for z in gp_combs}



In [42]:
lclust_samp.keys()

dict_keys([0])

In [44]:
fig= [go.Box(
    y= gl_dists[x].reshape(1,-1)[0],
    name= 'dist {}'.format(x)
) for x in gp_combs]

layout= go.Layout()

Figure= go.Figure(data= fig,layout= layout)
iplot(Figure)

fig= [go.Box(
    y= rec_coords[x].reshape(1,-1)[0],
    name= 'dist {}'.format(x)
) for x in gp_combs]

layout= go.Layout()

Figure= go.Figure(data= fig,layout= layout)
iplot(Figure)

In [45]:
### Impute cluster

In [46]:
gp_combs

[(0, 2), (0, 3), (2, 3)]

In [47]:
print('full data set shape: {}'.format(genotype.shape))

nan_n= 1

xnan= np.random.randint(0,genotype.shape[1],size= nan_n)
ynan= np.random.randint(0,genotype.shape[0],size= nan_n)

nan_coords= [ynan,xnan]
nan_coords= np.array(nan_coords).T

print(nan_coords)


full data set shape: (130, 40000)
[[   47 12979]]


In [48]:
nan_idx= 0

nan_obs= nan_coords[nan_idx]
nan_obs= [43,16000]
nan_acc= nan_obs[0]
nan_pos= nan_obs[1]

wind_sizes= 100
Nreps= 400
ncomps= 5
dimN= 2
metric= 'euclidean'


In [49]:
local_l= genotype[:,(nan_pos-int(wind_sizes/2)):(nan_pos+int(wind_sizes/2))]
#local_l= SequenceStore[1]['757862']
coords= {z:[x for x in range(len(label_vector)) if label_vector[x] == z] for z in list(set(label_vector))}

pca2 = PCA(n_components=ncomps, whiten=False,svd_solver='randomized')
featl= pca2.fit_transform(local_l)

figwl= [go.Scatter(
    x= feat_seq[coords[i],0],
    y= feat_seq[coords[i],1],
    mode= 'markers',
    name= str(i)
) for i in coords.keys()]

figwl.append(go.Scatter(
    mode='markers',
    x=[feat_seq[nan_acc,0]],
    y=[feat_seq[nan_acc,1]],
    marker=dict(
        color='rgba(135, 206, 250, 0)',
        size=25,
        opacity= 1,
        line=dict(
            color='red',
            width=5
        )
    ),
    showlegend=False
))

layout= go.Layout()

Figure_wl= go.Figure(data= figwl, layout= layout)

iplot(Figure_wl)

In [50]:
### Apply to coordinate.

In [51]:
dists_dict.keys()

dict_keys([0, 2, 3])

In [52]:
lb= 1e-2

clust_acc, ms_local, feat_seq= MS_get_norm(local_l,refs_lib,ncomps= ncomps,clsize= clsize,Bandwidth_split= Bandwidth_split,
       pca_qtl= pca_qtl)

mskeys= list(ms_local.keys())

#print(ms_local)
cluster_found= clustClass(ms_local,clov_pca,ref_gens,gen_stats= ref_stats,out_code= out_code)

cluster_found= {g[0]:z for z,g in cluster_found.items()}

clust_acc= {z: g for z,g in clust_acc.items() if z in cluster_found.keys()}
clust_acc= {cluster_found[z]:g for z,g in clust_acc.items()}

#cluster_found= {mskeys[z]:g for z,g in cluster_found.items()}
cluster_keys= list(clust_acc.keys())

lclust_gens, lclust_stats= kde_gen_dict(feat_seq,clust_acc)

lclust_samp= {z:g.sample(cl_samp) for z,g in lclust_gens.items()}
lclust_means= {z: np.mean(g,axis= 0) for z,g in lclust_samp.items()}

In [53]:
from impute_tools.impute_tools import get_bg_grid


P= 60
dimN= 2
expand= 3

Quanted_set= np.array(featl) * expand

background= get_bg_grid(Quanted_set, P= P, dimN= dimN)

In [54]:

def comb_score(background,lclust_samp= {},dists_gens= {},select_missing= 0,dimN= 2, metric= "euclidean"):
    dist_refs= {}
    dist_refs= {
        z: pairwise_distances(background,g[:,:dimN],metric= metric) for z,g in lclust_samp.items()
    }
    
    dist_refMeans= {z: np.mean(g,axis= 1) for z,g in dist_refs.items()}
    
    select_gens= {}

    for gp in lclust_samp.keys():
        g= dists_gens[gp]
        
        if g[select_missing]:
            select_gens[gp]= g[select_missing]
    
    ##
    bg_score= {z: g.score_samples(dist_refMeans[z].reshape(-1,1)) for z,g in select_gens.items()}
    
    bg_scores= np.array(list(bg_score.values()))
    bg_scores= np.exp(bg_scores)
    bg_scof= np.prod(bg_scores,axis= 0)
    ##
    return bg_scof

##
dists_gens= {z:D1_kdegen(g) for z,g in dists_dict.items() if z in lclust_samp.keys()}

##

select_missing= 2

bg_scof= comb_score(background,lclust_samp= lclust_samp,dists_gens= dists_gens,
                    select_missing= select_missing,dimN= dimN, metric= metric)

In [55]:


figwl= [go.Scatter(
    mode='markers',
    x=background[:,0],
    y=background[:,1],
    marker= {
    'color':bg_scof,
    'colorbar': go.scatter.marker.ColorBar(
        title= 'ColorBar'
    ),
    'colorscale':'Viridis',
    'line': {'width': 0},
    'size': 15,
    'symbol': 'circle',
  "opacity": 1
  }
)]

layout= go.Layout(
    height= 600,
    width= 600
)

Figure_wl= go.Figure(data= figwl, layout= layout)

iplot(Figure_wl)

In [37]:
from impute_tools.impute_tools import (
    nBg_MS, nBg_grid,
    gridWalk
)


P= 20
dimN= 2
N_samps= P**dimN
dist_comps= 10
Bandwidth_split = 30
kernel= 'gaussian'



select_missing= 0

BG_func= comb_score
BG_args= {
    'lclust_samp': lclust_samp,
    'dists_gens': dists_gens,
    'select_missing': 0,
    'dimN': dimN, 
    'metric': metric
}


granted, grid_likes= gridWalk(featl,dist_ref,BG_func, BG_args= BG_args, std_gp_use= std_gp_use,
            P= P,
            dimN= dimN,
            N_samps= N_samps,
            dist_comps= dist_comps,
            Bandwidth_split = Bandwidth_split,
            metric= metric,
            kernel= kernel)


NameError: name 'dist_ref' is not defined

In [None]:
from plotly import tools

title= 'coords'
fig_subplots = tools.make_subplots(rows=1, cols=2,subplot_titles=tuple([title]*2))

for trace in figwl:
    fig_subplots.append_trace(trace, 1, 1)
    

trace= go.Scatter(
    x= granted[:,0],
    y= granted[:,1],
    #z= grid_likes,
    mode= 'markers',
    marker= {
        'color':grid_likes,
        'colorbar': go.scatter.marker.ColorBar(
            title= 'ColorBar'
        ),
        'colorscale':'Viridis',
        'line': {'width': 0},
        'size': 5,
        'symbol': 'circle',
      "opacity": 1
      }
)

fig_subplots.append_trace(trace, 1,2)

iplot(fig_subplots)