In [1]:
import scipy
import numpy as np
import pandas as pd
import itertools as it
import time

import collections

def recursively_default_dict():
    """Build an arbitrarily nestable dictionary.

    Every missing key is filled with another dictionary produced by this
    same factory, so chained assignments such as ``d[a][b][c] = v`` work
    without creating the intermediate levels by hand.

    Returns:
        collections.defaultdict: an empty mapping whose default factory is
        this function itself.
    """
    nested = collections.defaultdict(recursively_default_dict)
    return nested

from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import MeanShift, estimate_bandwidth

from scipy.stats import invgamma 
from scipy.stats import beta

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *
import plotly.figure_factory as ff

from IPython.display import clear_output

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
# Put plotly into offline notebook mode so the iplot() calls in later cells
# can draw figures inline.
# NOTE(review): presumably this has to run before the first iplot() — confirm.
init_notebook_mode(connected=True)


In [3]:
Home= 'CLfreq_one' + '/'

### Freqs
# Each non-empty line of the file is whitespace-separated: three leading key
# fields (parsed as int, float, float) followed by the float values of the row.

filename= Home + 'CLfreq_one_freqs.txt'

freqs_dict= recursively_default_dict()   # freqs_dict[key0][key1][key2] -> list of floats
freqs_matrix= []                         # one list of floats per line, in file order

# `with` guarantees the handle is closed even when a line fails to parse.
with open(filename,'r') as Input:
    for line in Input:
        line= line.split()

        # Skip blank lines (e.g. a trailing newline) instead of crashing on
        # line[0] after an empty row has already been appended.
        if not line:
            continue

        # Parse the value columns once.
        values= [float(x) for x in line[3:]]

        freqs_matrix.append(values)
        # Store an independent copy so editing one container never alters the other.
        freqs_dict[int(line[0])][float(line[1])][float(line[2])]= list(values)

In [5]:
# Pool every value from all rows of freqs_matrix into one flat list.
Across= list(it.chain.from_iterable(freqs_matrix))

# Evaluation grid for the density curve: 1000 evenly spaced points on [0, 1].
X_plot = np.linspace(0, 1, 1000)

# Gaussian KDE over the pooled values; the estimator is fed a single-column 2-D array.
freq_kde = KernelDensity(kernel='gaussian', bandwidth=0.05).fit(np.array(Across).reshape(-1,1))

# score_samples yields log-densities, hence the np.exp() when plotting.
log_dens = freq_kde.score_samples(X_plot.reshape(-1,1))

# The trace name now matches what is plotted (it previously read 'mRNA pVal',
# a leftover label unrelated to this figure).
fig_roost_dens= [go.Scatter(x=X_plot, y=np.exp(log_dens),
                            mode='lines', fill='tozeroy', name= 'allele frequency',
                            line=dict(color='blue', width=2))]
##

layout= go.Layout(
    title= 'allele frequency distribution across clusters',
    yaxis= dict(
        title= 'density'
    ),
    xaxis= dict(
        title= 'frequency'
    )
)

fig = go.Figure(data=fig_roost_dens, layout= layout)
iplot(fig)

In [18]:
Home= 'CLfreq_one' + '/'

### KDE
# Same layout as the frequency file: three leading key fields (parsed as
# int, float, float) followed by the float values of the row.

filename= Home + 'CLfreq_one_KDE.txt'

kde_dict= recursively_default_dict()   # kde_dict[key0][key1][key2] -> list of floats
kde_matrix= []                         # one list of floats per line, in file order

# `with` guarantees the handle is closed even when a line fails to parse.
with open(filename,'r') as Input:
    for line in Input:
        line= line.split()

        # Skip blank lines (e.g. a trailing newline): they used to append an
        # empty row and then crash on line[0].
        if not line:
            continue

        # Parse the value columns once.
        values= [float(x) for x in line[3:]]

        kde_matrix.append(values)
        # Store an independent copy so editing one container never alters the other.
        kde_dict[int(line[0])][float(line[1])][float(line[2])]= list(values)

# Rows become the first axis; all rows are expected to have the same length.
kde_matrix= np.array(kde_matrix)

In [34]:
from structure_tools.StructE_tools import read_refs, FAMread

# Input files, all located under Home.
fam= Home + 'NG_001.fam'
admx= Home + 'admx_CORE.txt'
ref= Home + 'refs_CORE.txt'


orderCore= pd.read_csv(Home + 'Order_core.txt',sep= '\t')
Fam= FAMread(fam)

# NOTE(review): read_refs is a project helper; judging by how its first result
# is used below, it is a dict mapping a group code to a collection of
# indices into Fam — confirm against StructE_tools.
refs_lib, Parents, absent_refs  = read_refs(ref,Fam)

admx_lib, Crossed, absent_admx  = read_refs(admx,Fam)


# Merge the reference groups into admx_lib in place (same-key entries are
# overwritten by refs_lib); Geneo is an alias of that merged dict.
admx_lib.update(refs_lib)

Geneo = admx_lib

Geneo_order= list(Geneo.keys())

# Members of every group, concatenated in Geneo_order.
Whose= list(it.chain.from_iterable(Geneo[x] for x in Geneo_order))
# Whose must exist before Subset is derived from it; the original order of
# these two statements raised NameError on a fresh kernel.
Subset= list(range(len(Whose)))

# Re-codes the group keys of Geneo to 0-based labels.
code_reset={
    1:0,
    3:1,
    4:2,
    5:3,
    2:4
}

# One label per entry of Whose: each group's recoded label repeated once per member.
label_vector= list(it.chain.from_iterable([code_reset[z]] * len(Geneo[z]) for z in Geneo_order))
Names= [Fam[x] for x in Whose]

In [33]:
# Display the key order of Geneo; Whose and label_vector are built by walking
# the groups in exactly this order.
Geneo_order

[2, 5, 1, 3, 4]

In [28]:
from sklearn.cluster import KMeans
n_comp= 5

# PCA with the columns of kde_matrix as samples (note the transpose).
# The 'randomized' SVD solver is stochastic, so a fixed random_state is needed
# for the components, and the clusters built on them, to be reproducible.
pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized', random_state=0).fit(kde_matrix.T)
features = pca.transform(kde_matrix.T)
# One row of loadings per row of kde_matrix, one column per component.
COMPS= pca.components_.T

# Group the rows of kde_matrix by their loadings.
kmeans = KMeans(n_clusters=10, random_state=0).fit(COMPS)
labels1 = kmeans.labels_
# cluster label -> list of kde_matrix row indices assigned to that cluster.
label_select = {y: np.flatnonzero(labels1 == y).tolist() for y in np.unique(labels1)}

Cameo = []

for cramp in sorted(label_select.keys()):
    # Mean profile over the rows of this cluster (one value per column of kde_matrix).
    Clamp = np.mean(kde_matrix[label_select[cramp],:],axis = 0)
    # Keep only the columns listed in Subset, in that order.
    Fry = [Clamp[x] for x in Subset]
    Cameo.append(Fry)

# Transpose so that rows follow Subset and columns are the clusters.
Cameo = np.array(Cameo).T

print(Cameo.shape)

(948, 10)


In [39]:
from plotly import tools
from structure_tools.mstutorial_tools import KDE_pca

# Hand the PCA projections, the per-cluster mean profiles and the labels to the
# project plotting helper.
# NOTE(review): the meaning of the leading positional 0 is defined inside
# mstutorial_tools.KDE_pca — confirm there.
KDE_pca(
    0,
    feats=features,
    Cameo=Cameo,
    label_vector=label_vector,
    Subset=Subset,
    height=3000,
)

['Global', 'Global', 'cluster 1', 'cluster 1', 'cluster 2', 'cluster 2', 'cluster 3', 'cluster 3', 'cluster 4', 'cluster 4', 'cluster 5', 'cluster 5', 'cluster 6', 'cluster 6', 'cluster 7', 'cluster 7', 'cluster 8', 'cluster 8', 'cluster 9', 'cluster 9', 'cluster 10', 'cluster 10']
This is the format of your plot grid:
[ (1,1) x1,y1 ]     [ (1,2) x2,y2 ]   
[ (2,1) x3,y3 ]     [ (2,2) x4,y4 ]   
[ (3,1) x5,y5 ]     [ (3,2) x6,y6 ]   
[ (4,1) x7,y7 ]     [ (4,2) x8,y8 ]   
[ (5,1) x9,y9 ]     [ (5,2) x10,y10 ] 
[ (6,1) x11,y11 ]   [ (6,2) x12,y12 ] 
[ (7,1) x13,y13 ]   [ (7,2) x14,y14 ] 
[ (8,1) x15,y15 ]   [ (8,2) x16,y16 ] 
[ (9,1) x17,y17 ]   [ (9,2) x18,y18 ] 
[ (10,1) x19,y19 ]  [ (10,2) x20,y20 ]
[ (11,1) x21,y21 ]  [ (11,2) x22,y22 ]

