In [1]:
import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import MeanShift, estimate_bandwidth

import pandas as pd

from scipy import stats
from scipy.stats import beta
from math import sin
from random import randint

import matplotlib.pyplot as plt
import itertools as it

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

import collections

def recursively_default_dict():
        return collections.defaultdict(recursively_default_dict)

from matplotlib.collections import BrokenBarHCollection
import re

from random import shuffle

from random import sample
from sklearn.preprocessing import scale

In [8]:
genes_name= 'Genes_labs01,02,12_cp_overlap.txt'

Overlap_gen= recursively_default_dict()

Input= open(genes_name,'r')

for line in Input:
    line= line.split()
    if line[0] == 'ID':
        Names= line[1:]
    else:
        Overlap_gen[line[0]]= [float(x) for x in line[1:]]
    
Input.close()

Gene_names= [x for x in Overlap_gen.keys()]

print('number of genes: {}'.format(len(Overlap_gen)))

print('target comparisons: {}'.format(Names))

38796


['01', '02', '12', 'compound']

In [11]:
target= 'compound'
sumOver= ['01', '02', '12']

X= [np.sum([Overlap_gen[Gene_names[x]][Names.index(z)] for z in sumOver]) for x in range(len(Gene_names))]
#X= scale(X)

X_plot = np.linspace(min(X)-.5, max(X) + .5, 1000)

kde = KernelDensity(kernel='gaussian', bandwidth=0.1).fit(np.array(X).reshape(-1,1))

log_dens = kde.score_samples(X_plot.reshape(-1,1))

fig_roost_dens= [] 
fig_roost_dens.append(go.Scatter(x=X_plot, y=np.exp(log_dens), 
                            mode='lines', fill='tozeroy', name= 'gen similarity',
                            line=dict(color='blue', width=2)))

layout= go.Layout(
    title= 'p-value overlap across genes. {} classes.'.format(target),
    xaxis= dict(
        title= 'scaled overlap'
    ),
    yaxis= dict(
        title= 'density'
    )
)

fig = go.Figure(data=fig_roost_dens,layout= layout)
iplot(fig)

**Fig. 1** distribution of overlap values from target column.

In [7]:
X= np.array(X).reshape(-1,1)
bandwidth = estimate_bandwidth(X, quantile=0.15, n_samples=len(X))

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=True)
ms.fit(X)
labels = ms.labels_


labels1= {z:[x for x in range(len(labels)) if labels[x] == z] for z in list(set(labels)) if z != -1}

cluster_IDs= {z:[Gene_names[x] for x in labels1[z]] for z in labels1.keys()}

fig_box= []

for mound in labels1.keys():
    fig_box.append(go.Box(
        y= [X[z][0] for z in labels1[mound]],
        name= 'group: {}, mean= {}'.format(mound,np.round(np.mean([X[z][0] for z in labels1[mound]]),2)),
        marker= dict(
            color= 'blue'
        )
    ))

layout= go.Layout(
    title= 'Overlap MS clusters Mean',
    orientation= 'h'
)


fig = go.Figure(data=fig_box,layout= layout)
iplot(fig)


In [9]:
Gene_fit = pd.read_csv('MSU7_blockMerge.txt',sep= '\t')


ID= 'Conserved'

In [10]:
target_cluster= 1

print(len(cluster_IDs[target_cluster]))


Group_MSU= Gene_fit[Gene_fit.ID.isin(cluster_IDs[target_cluster])]
Group_MSU= Group_MSU[~Group_MSU.Note.isin(['expressed protein','hypothetical protein'])]


agriGO_file= 'target_{}_agriGO.txt'.format(target_cluster)

Output= open(agriGO_file,'w')

for gen in cluster_IDs[target_cluster]:
    Output.write('LOC_' + gen + '\n')

Output.close()

12667


In [11]:
Gene_select= ['Os08g32870','Os04g57530','Os07g11020','Os06g04200','Os04g38680','Os03g59640','Os03g36540','Os04g38660','Os01g62920','Os06g10350']

gen_lib= {
    'Os06g10350': 'Osc1',
    'Os01g62920': 'qSH1',
    'Os04g38660': 'Bh4.1',
    'Os04g38680': 'Bh4.2',
    'Os03g59640': 'CHl1',
    'Os03g36540': 'CHl9',
    'Os06g04200': 'Waxy',
    'Os07g11020': 'rc',
    'Os04g57530': 'sh4',
    'Os04g39020': 'badh',
    'Os08g32870': 'badh2',
}

Guys= Group_MSU[Gene_fit.ID.isin(gen_lib.keys())]
print([gen_lib[x] for x in Guys.ID])
Guys['cName']= pd.Series(np.array([gen_lib[x] for x in Guys.ID]),index= Guys.index)
Guys['Overlap']= pd.Series(np.array([round(X[Gene_names.index(x)][0],3) for x in Guys.ID]),index= Guys.index)

#### print genes found
import os
filename= 'Gene_target_cluster' + str(target_cluster)+'.txt'
Guys.to_csv(filename,sep= '\t')


Guys

['badh', 'Waxy', 'rc']



Boolean Series key will be reindexed to match DataFrame index.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Unnamed: 0,tag,chr,start,end,ID,Name,Note,cName,Overlap
15340,23167309,4,23171426,23176369,Os04g39020,Os04g39020,aldehyde dehydrogenase%2C putative%2C expressed,badh,1.143
20298,1759774,6,1765622,1770656,Os06g04200,Os06g04200,starch synthase%2C putative%2C expressed,Waxy,0.622
24076,6054229,7,6062889,6069317,Os07g11020,Os07g11020,rc - bHLH transcription factor regulating proa...,rc,0.914
