In [1]:
import pandas as pd
import numpy as np
import os
import itertools as it

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

import collections
init_notebook_mode(connected=True)

When run with the flag `--Varprint`, the script `Kernel_mPLib3D_FM36_Galaxy.py` outputs the proportion of variance explained by each principal component at each window. The amount of variance captured at each step of the analysis can be important for understanding the final classification output: at each window, the capacity of kernel density estimation (KDE) to capture local patterns of structure depends on how much of the total variance is reflected in the dimensions retained for classification.

In this notebook we look at the variance captured in a run of analyses conducted on data from the 3,000 Rice Genomes Project.

The rice genome is composed of twelve chromosomes. The results of each local PCA were stored in one file per chromosome. We begin by reading these files and concatenating them into a single pandas data frame. We then look at the distribution of the total variance retained by the analyses across the genome.

In [2]:
# Read the per-chromosome variance tables (chromosomes 1 through 12) and
# stack them into one data frame.
chromosomes= list(range(1, 13))

CHR_var= dict()

for Chr in chromosomes:
    # One tab-separated file per chromosome; the number is zero-padded to two digits.
    filename= 'Blocks_ExVAR_st3_CHR{:02d}.txt'.format(Chr)
    df= pd.read_csv(filename, sep= '\t')
    CHR_var[Chr]= df

# Concatenate the frames in chromosome order.
PCvar= pd.concat([CHR_var[key] for key in chromosomes])

PCvar.head()

Unnamed: 0,CHR,IN,PC1,PC2,PC3,PC4,PC5
0,1,39452674,0.63015,0.17319,0.08543,0.05533,0.01262
1,1,17825795,0.72818,0.15309,0.02358,0.02092,0.01209
2,1,35389445,0.55008,0.18217,0.15606,0.05694,0.01418
3,1,40200132,0.78771,0.07042,0.05938,0.01461,0.01114
4,1,10092551,0.37958,0.29719,0.1682,0.03484,0.01162


In [3]:
# Select the principal-component columns by name ('PC1', 'PC2', ...) instead of
# inferring their number from the column count. The count changes as soon as
# 'PCsum' is added below, so deriving n_comp from `PCvar.shape[1]` made this
# cell raise a KeyError (looking for a non-existent 'PC6') whenever it was
# re-run. Selecting by name keeps the cell idempotent.
pc_columns= [col for col in PCvar.columns
             if str(col).startswith('PC') and str(col)[2:].isdigit()]

# Number of components retained at each window.
n_comp= len(pc_columns)

# Total proportion of variance captured by the retained components, per window.
PCvar['PCsum']= PCvar[pc_columns].sum(axis= 1)

PCvar.head()

Unnamed: 0,CHR,IN,PC1,PC2,PC3,PC4,PC5,PCsum
0,1,39452674,0.63015,0.17319,0.08543,0.05533,0.01262,0.95672
1,1,17825795,0.72818,0.15309,0.02358,0.02092,0.01209,0.93786
2,1,35389445,0.55008,0.18217,0.15606,0.05694,0.01418,0.95943
3,1,40200132,0.78771,0.07042,0.05938,0.01461,0.01114,0.94326
4,1,10092551,0.37958,0.29719,0.1682,0.03484,0.01162,0.89143


In [4]:
from sklearn.neighbors import KernelDensity
from sklearn.cluster import estimate_bandwidth


# Total variance captured at each window (sum over the retained components).
total_var= PCvar['PCsum']

# Evaluation grid for the density curve; it extends slightly below 0 and above 1.
X_plot = np.linspace(-0.1, 1.1, 100)

# Fit a Gaussian kernel density estimate to the per-window totals.
# KernelDensity works on 2-D (n_samples, n_features) input, hence the
# single-column reshape.
observed= np.array(total_var).reshape(-1, 1)
kde = KernelDensity(kernel= 'gaussian', bandwidth= 0.01)
kde.fit(observed)

# score_samples returns log-densities; exponentiate them for plotting.
log_dens = kde.score_samples(X_plot[:, np.newaxis])
density= np.exp(log_dens)

fig_roost_dens= [
    go.Scatter(
        x= X_plot,
        y= density,
        mode= 'lines',
        fill= 'tozeroy',
        name= 'variance captured',
        line= dict(color= 'blue', width= 2)
    )
]

layout= go.Layout(
    title= 'ncomp: {}'.format(n_comp),
    xaxis= dict(title= 'variance explained'),
    yaxis= dict(title= 'density')
)

fig = go.Figure(data= fig_roost_dens, layout= layout)
iplot(fig)


In [8]:
# Threshold on the total proportion of variance captured at a window.
how_much= .8

# Fraction (0-1) of windows whose total captured variance is at least `how_much`.
# The mean of a boolean Series is the share of True values.
how_many= (total_var >= how_much).mean()

# Report the share as an actual percentage: the previous message was labelled
# '%' but printed the raw fraction, and said 'above' although the comparison
# is inclusive (>=).
print('{:.1f}% of data sets with at least {:.0f}% of variance explained'.format(
    how_many * 100, how_much * 100))

# Mean and standard deviation (np.std default, ddof=0) of the captured variance.
print('mean: {}; sd: {}'.format(round(np.mean(total_var),3),round(np.std(total_var),3)))

% data sets with above 80.0 var explained: 0.8950057854981638
mean: 0.894; sd: 0.076


In [7]:

# Box plot of the total variance captured per window.
trace1 = [go.Box(
    y=total_var,
    name= 'variance captured'
)]

# The values are drawn along the y axis, so that is the axis that carries
# 'variance explained'. The previous layout was copied from the density plot
# and labelled the y axis 'density' and the x axis 'variance explained',
# neither of which applies to a box plot of `total_var`.
layout= go.Layout(
    title= 'ncomp: {}'.format(n_comp),
    yaxis= dict(
        title= 'variance explained'
    )
)

fig= go.Figure(data= trace1,layout= layout)
iplot(fig)