From 760d1f7bd96c3eb6cc863ae65df88f6083bb2ab8 Mon Sep 17 00:00:00 2001 From: Lee Cooper Date: Fri, 19 Aug 2016 00:51:11 -0400 Subject: [PATCH 1/5] Converted to matplotlib. Added KM and ScatterPair functions. --- survivalnet/analysis/Visualize.py | 516 +++++++++++++++++++----------- 1 file changed, 330 insertions(+), 186 deletions(-) diff --git a/survivalnet/analysis/Visualize.py b/survivalnet/analysis/Visualize.py index c3a2603..54a0d44 100644 --- a/survivalnet/analysis/Visualize.py +++ b/survivalnet/analysis/Visualize.py @@ -1,15 +1,41 @@ +from lifelines import KaplanMeierFitter +import matplotlib.pyplot as plt import numpy as np -import plotly as py -import plotly.graph_objs as go +from statsmodels.nonparametric.smoothers_lowess import lowess +from textwrap import wrap from . import RiskCohort +from . import RiskCluster # define colors for positive risk (red) and negative risk (blue) -Red = 'rgba(222,45,38,0.8)' -Blue = 'rgb(49,130,189)' - - -def Visualize(Model, Features, Symbols, N=30): +REDFACE = '#DE2D26' +BLUEFACE = '#3182BD' +REDEDGE = '#DE2D26' +BLUEEDGE = '#3182BD' +MEDIAN = '#000000' +WHISKER = '#AAAAAA' +POINTS = '#000000' +GRID = '#BBBBBB' + +# layout constants general +WRAP = 20 # number of characters for text wrapping +SPACING = 0.2 # margin + +# layout constants for boxplot +BOX_FH = 4 # boxplot figure height +BOX_FW = 8 # boxplot figure width +JITTER = 0.08 + +# layout constants for pairwise feature plot +PAIR_FW = 10 + +# layout constants for survival plot +SURV_FW = 6 +SURV_FH = 6 + + +def Visualize(Model, Normalized, Raw, Symbols, Survival, Censored, + GeneSet=False, N=30, Tau=0.05, Path=None): """ Generate visualizations of risk profiles.
Backpropagation is used to @@ -46,26 +72,39 @@ def Visualize(Model, Features, Symbols, N=30): Corrected[i] = Prefix[i] + Suffix[i] # generate risk derivative profiles for cohort - Gradients = RiskCohort(Model, Features) - - # generate ranked bar chart - RankedBar(Gradients, Symbols, N) + Gradients = RiskCohort(Model, Normalized) # generate ranked box plot series - RankedBox(Gradients, Symbols, N) + RBFig = RankedBox(Gradients, Symbols, N) # generate paired scatter plot - PairScatter(Gradients, Symbols, N) + PSFig = PairScatter(Gradients, Symbols, N) + + # generate cluster plot + CFig = RiskCluster(Gradients, Raw, Symbols, N, Tau) + + # generate Kaplan-Meier plots for individual features + KMFigs, KMNames = KMPlots(Raw, Symbols, Survival, Censored, N) + + # save figures + if Path is not None: + + # save standard figures + RBFig.savefig(Path + 'RankedBox.pdf') + PSFig.savefig(Path + 'PairedScatter.pdf') + CFig.savefig(Path + 'Heatmap.pdf') + for i, Figure in enumerate(KMFigs): + Figure.savefig(Path + 'KM.' + KMNames[i] + '.pdf') -def RankedBar(Gradients, Symbols, N=30): +def RankedBox(Gradients, Symbols, N=30): """ - Generates bar chart of feature gradients ranked by absolute magnitude. + Generates boxplot series of feature gradients ranked by absolute magnitude. Parameters: ---------- - Risk_Gradients: numpy matrix + Gradients: numpy matrix a matrix containing feature weights. Symbols: numpy nd array @@ -74,44 +113,74 @@ def RankedBar(Gradients, Symbols, N=30): N: integer value number of featurs to display in barchart. + Returns + ------- + Figure : figure handle + Handle to figure used for saving image to disk i.e. 
+ Figure.savefig('heatmap.pdf') """ - # calculate means, standard deviations if multiple sample provided - if(Gradients.shape[0] > 1): - Mean = np.asarray(np.mean(Gradients, axis=0)) - Std = np.asarray(np.std(Gradients, axis=0)) - data = zip(Symbols, Mean, Std) - else: - data = zip(Symbols, np.asarray(Gradients)[0]) - - # sort by mean gradient for cohorts, gradient for individual samples - data = sorted(data, key=lambda x: np.abs(x[1]), reverse=True) - - # generate variables for visualization - if(Gradients.shape[1] > 1): - Means = [X[1] for X in data[0:N]] - Stdevs = [X[2] for X in data[0:N]] - Colors = [Red if X[1] > 0 else Blue for X in data[0:N]] - Labels = [X[0] for X in data[0:N]] - else: - Values = [X[1] for X in data[0:N]] - Colors = [Red if X[1] > 0 else Blue for X in data[0:N]] - Labels = [X[0] for X in data[0:N]] - - # generate plot - if(Gradients.shape[1] > 1): - trace = [go.Bar(x=Labels, y=Means, type='bar', - error_y=dict(type='data', array=Stdevs, visible=True), - name='Risk Gradient', - marker=dict(color=Colors))] - else: - trace = [go.Bar(x=Labels, y=Values, type='bar', - name='Risk Gradient', - marker=dict(color=Colors))] - py.offline.plot(trace, filename='RankedBar') + # generate mean values + Means = np.asarray(np.mean(Gradients, axis=0)) + # sort features by mean absolute gradient + Order = np.argsort(-np.abs(Means)) + + # generate figure and add axes + Figure = plt.figure(figsize=(BOX_FW, BOX_FH), facecolor='white') + Axes = Figure.add_axes([SPACING, SPACING, 1-2*SPACING, 1-2*SPACING], + frame_on=False) + Axes.set_axis_bgcolor('white') + + # generate boxplots + Box = Axes.boxplot(Gradients[:, Order[0:N]], + patch_artist=True, + showfliers=False) + + # set global properties + plt.setp(Box['medians'], color=MEDIAN, linewidth=1) + plt.setp(Box['whiskers'], color=WHISKER, linewidth=1, linestyle='-') + plt.setp(Box['caps'], color=WHISKER, linewidth=1) + + # modify box styling + for i, box in enumerate(Box['boxes']): + if Means[Order[i]] <= 0: + 
box.set(color=BLUEEDGE, linewidth=2) + box.set(facecolor=BLUEFACE) + else: + box.set(color=REDEDGE, linewidth=2) + box.set(facecolor=REDFACE) -def RankedBox(Gradients, Symbols, N=30): + # add jittered data overlays + for i in np.arange(N): + plt.scatter(np.random.normal(i+1, JITTER, size=Gradients.shape[0]), + Gradients[:, Order[i]], color=POINTS, alpha=0.2, + marker='o', s=2, + zorder=100) + + # set limits + Axes.set_ylim(1.05 * Gradients.min(), 1.05 * Gradients.max()) + + # format x axis + plt.xlabel('Model Features') + Fixed = _FixSymbols(Symbols) + Names = plt.setp(Axes, xticklabels=[Fixed[Order[i]] for i in np.arange(N)]) + plt.setp(Names, rotation=90, fontsize=10) + Axes.set_xticks(np.linspace(1.5, N-0.5, N-1), minor=True) + Axes.xaxis.set_ticks_position('bottom') + + # format y axis + plt.ylabel('Risk Gradient') + Axes.yaxis.set_ticks_position('left') + + # add grid lines and zero line + Axes.xaxis.grid(True, color=GRID, linestyle='-', which='minor') + plt.axhline(0, color='black') + + return Figure + + +def PairScatter(Gradients, Symbols, N=30): """ Generates boxplot series of feature gradients ranked by absolute magnitude. 
@@ -129,41 +198,68 @@ def RankedBox(Gradients, Symbols, N=30): """ - # generate mean values + # calculate means, standard deviations Means = np.asarray(np.mean(Gradients, axis=0)) + Std = np.asarray(np.std(Gradients, axis=0)) - # generate colors - Colors = [Red if mean > 0 else Blue for mean in Means] - - # zip data - data = zip(Symbols, Means, Colors, list(np.array(Gradients).transpose())) - - # sort by mean gradient for cohorts, gradient for individual samples - data = sorted(data, key=lambda x: np.abs(x[1]), reverse=True) + # sort features by mean absolute gradient + Order = np.argsort(-np.abs(Means)) + + # generate subplots + Figure, Axes = plt.subplots(nrows=N, ncols=N, + figsize=(PAIR_FW, PAIR_FW), + facecolor='white') + Figure.subplots_adjust(hspace=SPACING, wspace=SPACING, bottom=SPACING) + + # remove axes and ticks + for ax in Axes.flat: + ax.xaxis.set_visible(False) + ax.yaxis.set_visible(False) + + # generate scatter plots in lower triangular portion + for i, j in zip(*np.triu_indices_from(Axes, k=1)): + Axes[i, j].scatter((Gradients[:, Order[j]]-Means[Order[j]]) / + Std[Order[j]], + (Gradients[:, Order[i]]-Means[Order[i]]) / + Std[Order[i]], + color=POINTS, alpha=0.2, marker='o', s=2) + Smooth = lowess((Gradients[:, Order[j]]-Means[Order[j]]) / + Std[Order[j]], + (Gradients[:, Order[i]]-Means[Order[i]]) / + Std[Order[i]]) + Axes[i, j].plot(Smooth[:, 1], Smooth[:, 0], color='red') + + # generate histograms on diagonal + Fixed = _FixSymbols(Symbols, WRAP) + for i in np.arange(N): + if Means[Order[i]] <= 0: + Axes[i, i].hist(Gradients[:, Order[i]], + facecolor=BLUEFACE, + alpha=0.8) + else: + Axes[i, i].hist(Gradients[:, Order[i]], + facecolor=REDFACE, + alpha=0.8) + Axes[i, i].annotate(Fixed[Order[i]], (0, 0), + xycoords='axes fraction', + ha='right', va='top', + rotation=45) - # generate boxplot traces - Traces = [] - for Symbol, Mean, Color, Points in data[0:N]: - Traces.append(go.Box(y=Points, - name=Symbol, - jitter=0.5, - whiskerwidth=0.2, - 
boxpoints='all', - fillcolor=Color, - marker=dict(size=1, color=Color), - line=dict(width=1),)) + # delete unused axes + for i, j in zip(*np.tril_indices_from(Axes, k=-1)): + Figure.delaxes(Axes[i, j]) - py.offline.plot(Traces, filename='RankedBox') + return Figure -def PairScatter(Gradients, Symbols, N=30): +def KMPlots(Raw, Symbols, Survival, Censored, N=30): """ - Generates boxplot series of feature gradients ranked by absolute magnitude. + Generates KM plots for individual features ranked by absolute magnitude. Parameters: ---------- - Risk_Gradients: numpy matrix + Gradients: numpy matrix a matrix containing feature weights. Symbols: numpy nd array @@ -172,128 +268,176 @@ def PairScatter(Gradients, Symbols, N=30): N: integer value number of featurs to display in barchart. + Returns + ------- + Figures : figure handle + List containing handles to figures. + + Names : array_like + List of feature names for figures in 'Figures' + + Notes + ----- + Note this uses feature values as opposed to back-propagated risk gradients. 
""" - # calculate means, standard deviations - Means = np.asarray(np.mean(Gradients, axis=0)) - Std = np.asarray(np.std(Gradients, axis=0)) + # initialize list of figures and names + Figures = [] + Names = [] - # zip data - data = zip(Symbols, Means, Std, list(np.array(Gradients).transpose())) + # generate mean values + Means = np.asarray(np.mean(Raw, axis=0)) - # sort by mean gradient for cohorts, gradient for individual samples - data = sorted(data, key=lambda x: np.abs(x[1]), reverse=True) + # sort features by mean absolute gradient + Order = np.argsort(-np.abs(Means)) - # generate subplot titles - Titles = [data[0][0]] - for i in np.arange(1, N): - for j in np.arange(N): - Titles.append("") - Titles.append(data[i][0]) - Titles = tuple(Titles) + # generate Kaplan Meier fitter + kmf = KaplanMeierFitter() - # generate subplot matrix - Figure = py.tools.make_subplots(rows=N, cols=N, subplot_titles=Titles) + # generate KM plot for each feature + for count, i in enumerate(Order[0:N]): - # generate individual subplots - for i in np.arange(N): + # generate figure and axes + Figures.append(plt.figure(figsize=(SURV_FW, SURV_FH), + facecolor='white')) + Axes = Figures[count].add_axes([SPACING, SPACING, + 1-2*SPACING, 1-2*SPACING]) - # append scatter plot for each variable pair - for j in np.arange(i): - Figure.append_trace(go.Scatter(x=data[i][3] / data[i][2], - y=data[j][3] / data[j][2], - text=str(1.0), - mode='markers', - marker=dict(color='grey', - size=1)), - i+1, j+1) - - # add histograms on diagonal - Figure.append_trace(go.Histogram(x=np.array(Gradients[:, i] / - Std[i]).squeeze(), - marker=dict(color='red')), - i+1, i+1) + # generate names + Names.append(Symbols[i]) - for i in np.arange(N): + # extract suffix to classify feature + Suffix = Symbols[i][str.rfind(str(Symbols[i]), '_'):].strip() - # append scatter plot for each variable pair - for j in np.arange(i+1, N): - rho = np.sum(((data[i][3] - data[i][1]) / data[i][2]) * \ - ((data[j][3] - data[j][1]) / 
data[j][2])) - - Figure.append_trace(go.Scatter(x=[], - y=[], - text=str(rho), - mode='markers', - marker=dict(color='grey', - size=1)), - i+1, j+1) - - # add histograms on diagonal - Figure.append_trace(go.Histogram(x=np.array(Gradients[:, i] / - Std[i]).squeeze(), - marker=dict(color='red')), - i+1, i+1) - - # perform layouts for individual subtypes - for i in np.arange(N): - for j in np.arange(N): - - Index = i*N + j + 1 - - if (j < i): - - # calculate index of lower triangular plot - Index = i*N + j + 1 - - # update x,y axis layout for scatter plots - Figure['layout']['xaxis'+str(Index)].update(autorange=True, - showgrid=False, - zeroline=False, - showline=True, - autotick=True, - ticks='', - showticklabels=False, - linecolor='#636363', - linewidth=1) - Figure['layout']['yaxis'+str(Index)].update(autorange=True, - showgrid=False, - zeroline=False, - showline=True, - autotick=True, - ticks='', - showticklabels=False, - linecolor='#636363', - linewidth=1) - - elif j == i: - - # update histogram layouts - Figure['layout']['yaxis'+str(Index)].update(autorange=True, - showgrid=False, - zeroline=False, - showline=False, - autotick=True, - ticks='', - showticklabels=False) + if Suffix == '_Clinical': + + # get unique values to determine if binary or continuous + Unique = np.unique(Raw[:, i]) + + # process based on variable type + if Unique.size == 2: + + # extract and plot mutant and wild-type survival profiles + kmf.fit(Survival[Raw[:, i] == Unique[0]], + Censored[Raw[:, i] == Unique[0]] == 1, + label=Symbols[i] + str(Unique[0])) + kmf.plot(ax=Axes) + kmf.fit(Survival[Raw[:, i] == Unique[1]], + Censored[Raw[:, i] == Unique[1]] == 1, + label=Symbols[i] + str(Unique[1])) + kmf.plot(ax=Axes) + plt.ylim(0, 1) else: - # update x,y axis layout for scatter plots - Figure['layout']['xaxis'+str(Index)].update(autorange=True, - showgrid=False, - zeroline=False, - showline=False, - autotick=True, - ticks='', - showticklabels=False) - 
Figure['layout']['yaxis'+str(Index)].update(autorange=True, - showgrid=False, - zeroline=False, - showline=False, - autotick=True, - ticks='', - showticklabels=False) - - # generate plot - py.offline.plot(Figure, filename='PairScatter') + # determine median value + Median = np.median(Raw[:, i]) + + # extract and altered and unaltered survival profiles + kmf.fit(Survival[Raw[:, i] > Median], + Censored[Raw[:, i] > Median] == 1, + label=Symbols[i] + " > " + str(Median)) + kmf.plot(ax=Axes) + kmf.fit(Survival[Raw[:, i] <= Median], + Censored[Raw[:, i] <= Median] == 1, + label=Symbols[i] + " <= " + str(Median)) + kmf.plot(ax=Axes) + plt.ylim(0, 1) + + elif Suffix == '_Mut': + + # extract and plot mutant and wild-type survival profiles + kmf.fit(Survival[Raw[:, i] == 1], + Censored[Raw[:, i] == 1] == 1, + label=Symbols[i] + " Mutant") + kmf.plot(ax=Axes) + kmf.fit(Survival[Raw[:, i] == 0], + Censored[Raw[:, i] == 0] == 1, + label=Symbols[i] + " Mutant") + kmf.plot(ax=Axes) + plt.ylim(0, 1) + + elif Suffix == '_CNV': + + # determine if alteration is amplification or deletion + Amplified = np.mean(Raw[:, i]) > 0 + + # extract and plot altered and unaltered survival profiles + if Amplified: + kmf.fit(Survival[Raw[:, i] > 0], + Censored[Raw[:, i] > 0] == 1, + label=Symbols[i] + " Amplified") + kmf.plot(ax=Axes) + kmf.fit(Survival[Raw[:, i] <= 0], + Censored[Raw[:, i] <= 0] == 1, + label=Symbols[i] + " not Amplified") + kmf.plot(ax=Axes) + else: + kmf.fit(Survival[Raw[:, i] < 0], + Censored[Raw[:, i] < 0] == 1, + label=Symbols[i] + " Deleted") + kmf.plot(ax=Axes) + kmf.fit(Survival[Raw[:, i] >= 0], + Censored[Raw[:, i] >= 0] == 1, + label=Symbols[i] + " not Deleted") + kmf.plot(ax=Axes) + plt.ylim(0, 1) + + elif Suffix == '_CNVArm': + + # determine if alteration is amplification or deletion + Amplified = np.mean(Raw[:, i]) > 0 + + # extract and plot altered and unaltered survival profiles + if Amplified: + kmf.fit(Survival[Raw[:, i] > 0.25], + Censored[Raw[:, i] > 0.25] == 
1, + label=Symbols[i] + " Amplified") + kmf.plot(ax=Axes) + kmf.fit(Survival[Raw[:, i] <= 0.25], + Censored[Raw[:, i] <= 0.25] == 1, + label=Symbols[i] + " not Amplified") + kmf.plot(ax=Axes) + else: + kmf.fit(Survival[Raw[:, i] < -0.25], + Censored[Raw[:, i] < -0.25] == 1, + label=Symbols[i] + " Deleted") + kmf.plot(ax=Axes) + kmf.fit(Survival[Raw[:, i] >= -0.25], + Censored[Raw[:, i] >= -0.25] == 1, + label=Symbols[i] + " not Deleted") + kmf.plot(ax=Axes) + plt.ylim(0, 1) + + elif (Suffix == '_Protein') or (Suffix == '_mRNA'): + + # determine median expression + Median = np.median(Raw[:, i]) + + # extract and altered and unaltered survival profiles + kmf.fit(Survival[Raw[:, i] > Median], + Censored[Raw[:, i] > Median] == 1, + label=Symbols[i] + " Higher Expression") + kmf.plot(ax=Axes) + kmf.fit(Survival[Raw[:, i] <= Median], + Censored[Raw[:, i] <= Median] == 1, + label=Symbols[i] + " Lower Expression") + kmf.plot(ax=Axes) + plt.ylim(0, 1) + + else: + raise ValueError('Unrecognized feature type') + + return Figures, Names + + +def _FixSymbols(Symbols, Length=WRAP): + """ + Removes trailing and leading whitespace and wraps long labels + """ + + # remove whitespace and wrap + Fixed = ['\n'.join(wrap(Symbol.strip().replace('_', ' '), Length)) + for Symbol in Symbols] + + return Fixed From 4dc87b7932407e38992758fe93e83f5ddb3a755f Mon Sep 17 00:00:00 2001 From: Lee Cooper Date: Fri, 19 Aug 2016 00:52:08 -0400 Subject: [PATCH 2/5] Updated submodule name to "analysis". --- survivalnet/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/survivalnet/__init__.py b/survivalnet/__init__.py index b02d01a..c1d408a 100644 --- a/survivalnet/__init__.py +++ b/survivalnet/__init__.py @@ -8,7 +8,7 @@ from .train import train # sub-packages with no internal dependencies -from . import sensitivity +from . 
import analysis # must be imported before Bayesian_Optimizaiton #from .CostFunction import cost_func, aggr_st_cost_func, st_cost_func @@ -26,5 +26,5 @@ # sub-packages 'model', 'optimization', - 'sensitivity', + 'analysis', ) From d27dc06e84afd1323b47dd3d76086d79854f55c4 Mon Sep 17 00:00:00 2001 From: Lee Cooper Date: Fri, 19 Aug 2016 00:53:41 -0400 Subject: [PATCH 3/5] Moved matrix conversion into function. --- survivalnet/analysis/RiskCohort.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/survivalnet/analysis/RiskCohort.py b/survivalnet/analysis/RiskCohort.py index c358b23..d3ef7ef 100644 --- a/survivalnet/analysis/RiskCohort.py +++ b/survivalnet/analysis/RiskCohort.py @@ -36,9 +36,12 @@ def RiskCohort(Model, Features): # initialize container for risk gradient profiles Gradients = np.zeros(Features.shape) + # copy input to matrix for Theano + Matrix = np.matrix(Features) + # iterate through samples, calculating risk gradient profile for each for i in np.arange(Features.shape[0]): - Gradients[i, :] = _RiskBackpropagate(Model, Features[i, :]) + Gradients[i, :] = _RiskBackpropagate(Model, Matrix[i, :]) return Gradients From cf7f54d9f92efbca6af12959f1e8305b113b9d96 Mon Sep 17 00:00:00 2001 From: Lee Cooper Date: Fri, 19 Aug 2016 01:20:50 -0400 Subject: [PATCH 4/5] Added stdout messaging. Removed unused input parameter. Reduced default value for N. --- survivalnet/analysis/Visualize.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/survivalnet/analysis/Visualize.py b/survivalnet/analysis/Visualize.py index 54a0d44..7268e72 100644 --- a/survivalnet/analysis/Visualize.py +++ b/survivalnet/analysis/Visualize.py @@ -35,7 +35,7 @@ def Visualize(Model, Normalized, Raw, Symbols, Survival, Censored, - GeneSet=False, N=30, Tau=0.05, Path=None): + N=10, Tau=0.05, Path=None): """ Generate visualizations of risk profiles. 
Backpropagation is used to @@ -72,21 +72,27 @@ def Visualize(Model, Normalized, Raw, Symbols, Survival, Censored, Corrected[i] = Prefix[i] + Suffix[i] # generate risk derivative profiles for cohort + print "Generting risk gradient profiles..." Gradients = RiskCohort(Model, Normalized) # generate ranked box plot series + print "Generating risk gradient boxplot..." RBFig = RankedBox(Gradients, Symbols, N) # generate paired scatter plot + print "Generating paired scatter plots..." PSFig = PairScatter(Gradients, Symbols, N) # generate cluster plot + print "Generating cluster analysis..." CFig = RiskCluster(Gradients, Raw, Symbols, N, Tau) # generate Kaplan-Meier plots for individual features + print "Generating Kaplan-Meier plots..." KMFigs, KMNames = KMPlots(Raw, Symbols, Survival, Censored, N) # save figures + print "Saving figures..." if Path is not None: # save standard figures From 0ef86deb195914ff78c4b97b1dff075c3cb07e76 Mon Sep 17 00:00:00 2001 From: Lee Cooper Date: Fri, 19 Aug 2016 02:55:29 -0400 Subject: [PATCH 5/5] Fixed imports, added docstrings. 
--- survivalnet/analysis/RiskCluster.py | 8 +- survivalnet/analysis/Visualize.py | 115 ++++++++++++++++++--------- survivalnet/analysis/__init__.py | 4 +- survivalnet/optimization/__init__.py | 1 - 4 files changed, 85 insertions(+), 43 deletions(-) diff --git a/survivalnet/analysis/RiskCluster.py b/survivalnet/analysis/RiskCluster.py index a8946df..d66af47 100644 --- a/survivalnet/analysis/RiskCluster.py +++ b/survivalnet/analysis/RiskCluster.py @@ -1,5 +1,5 @@ import matplotlib as mpl -import matplotlib.pyplot as pylab +import matplotlib.pyplot as plt import numpy as np import scipy.cluster.hierarchy as sch import scipy.spatial.distance as dist @@ -85,7 +85,7 @@ def RiskCluster(Gradients, Raw, Symbols, N=30, Tau=0.05): Normalized = Normalized.transpose() # generate figure - Figure = pylab.figure(figsize=(WINDOW_WIDTH, WINDOW_HEIGHT)) + Figure = plt.figure(figsize=(WINDOW_WIDTH, WINDOW_HEIGHT)) # cluster samples and generate dendrogram SampleDist = dist.pdist(Normalized.T, 'correlation') @@ -137,7 +137,7 @@ def RiskCluster(Gradients, Raw, Symbols, N=30, Tau=0.05): Heatmap = Figure.add_axes([HEATMAP_X, HEATMAP_Y, HEATMAP_W, HEATMAP_H], frame_on=False) Heatmap.matshow(Reordered, aspect='auto', origin='lower', - cmap=pylab.cm.bwr) + cmap=plt.cm.bwr) Heatmap.set_xticks([]) Heatmap.set_yticks([]) @@ -175,7 +175,7 @@ def RiskCluster(Gradients, Raw, Symbols, N=30, Tau=0.05): cnv = Figure.add_axes([TRACK_X, TRACK_Y, TRACK_W, TRACK_H - len(SigMut)*TRACK], frame_on=False) - cnv.matshow(CNVs, aspect='auto', origin='lower', cmap=pylab.cm.bwr) + cnv.matshow(CNVs, aspect='auto', origin='lower', cmap=plt.cm.bwr) for i in range(len(SigCNV)): cnv.text(-SPACING, i / np.float(len(SigCNV)) + 1/np.float(2*len(SigCNV)), diff --git a/survivalnet/analysis/Visualize.py b/survivalnet/analysis/Visualize.py index 7268e72..bbf86f4 100644 --- a/survivalnet/analysis/Visualize.py +++ b/survivalnet/analysis/Visualize.py @@ -4,8 +4,8 @@ from statsmodels.nonparametric.smoothers_lowess import lowess 
from textwrap import wrap -from . import RiskCohort -from . import RiskCluster +from .RiskCohort import RiskCohort +from .RiskCluster import RiskCluster # define colors for positive risk (red) and negative risk (blue) REDFACE = '#DE2D26' @@ -41,19 +41,39 @@ def Visualize(Model, Normalized, Raw, Symbols, Survival, Censored, Parameters: ----------- - Model : class - Model defined by finetuning + Model generated by finetuning. + + Normalized : array_like + Numpy array containing normalized feature values used in training / + finetuning. These are used to examine associations between feature values + and cluster assignments. Features are in columns and samples are in rows. - Features : array_like - An N x P array containing the normalized (z-scored) features used in model - finetuning. Contains P features from N total patients. + Raw : array_like + Numpy array containing raw, unnormalized feature values. These are used to + examine associations between feature values and cluster assignments. + Features are in columns and samples are in rows. Symbols : array_like - P-length list of strings describing model inputs + List containing strings describing features. See Notes below for + restrictions on symbol names. + + Survival : array_like + Array containing death or last followup values. + + Censored : array_like + Array containing vital status at last followup. 1 (alive) or 0 (deceased). - N : integer - Number of features to analyze. + N : scalar + Number of features to include in analysis. Features are scored by absolute + mean gradient and the highest N magnitude features will be used + to generate the plot. + + Tau : scalar + Threshold for statistical significance when examining cluster associations. + + Path : string + Path to store .pdf versions of plots generated. 
""" # modify duplicate symbols where needed - append index to each instance @@ -85,11 +105,11 @@ def Visualize(Model, Normalized, Raw, Symbols, Survival, Censored, # generate cluster plot print "Generating cluster analysis..." - CFig = RiskCluster(Gradients, Raw, Symbols, N, Tau) + CFig, Labels = RiskCluster(Gradients, Raw, Symbols, N, Tau) # generate Kaplan-Meier plots for individual features print "Generating Kaplan-Meier plots..." - KMFigs, KMNames = KMPlots(Raw, Symbols, Survival, Censored, N) + KMFigs, KMNames = KMPlots(Gradients, Raw, Symbols, Survival, Censored, N) # save figures print "Saving figures..." @@ -103,21 +123,24 @@ def Visualize(Model, Normalized, Raw, Symbols, Survival, Censored, Figure.savefig(Path + 'KM.' + KMNames[i] + '.pdf') -def RankedBox(Gradients, Symbols, N=30): +def RankedBox(Gradients, Symbols, N=10): """ Generates boxplot series of feature gradients ranked by absolute magnitude. Parameters: ---------- + Gradients : array_like + Numpy array containing feature/sample gradients obtained by RiskCohort. + Features are in columns and samples are in rows. - Gradients: numpy matrix - a matrix containing feature weights. - - Symbols: numpy nd array - a matrix of feature Symbols. + Symbols : array_like + List containing strings describing features. See Notes below for + restrictions on symbol names. - N: integer value - number of featurs to display in barchart. + N : scalar + Number of features to include in analysis. Features are scored by absolute + mean gradient and the highest N magnitude features will be used + to generate the plot. Returns ------- @@ -186,22 +209,25 @@ def RankedBox(Gradients, Symbols, N=30): return Figure -def PairScatter(Gradients, Symbols, N=30): +def PairScatter(Gradients, Symbols, N=10): """ Generates boxplot series of feature gradients ranked by absolute magnitude. Parameters: ---------- - Risk_Gradients: numpy matrix - a matrix containing feature weights. 
+ Gradients : array_like + Numpy array containing feature/sample gradients obtained by RiskCohort. + Features are in columns and samples are in rows. - Symbols: numpy nd array - a matrix of feature Symbols. - - N: integer value - number of featurs to display in barchart. + Symbols : array_like + List containing strings describing features. See Notes below for + restrictions on symbol names. + N : scalar + Number of features to include in analysis. Features are scored by absolute + mean gradient and the highest N magnitude features will be used + to generate the plot. """ # calculate means, standard deviations @@ -258,21 +284,36 @@ def PairScatter(Gradients, Symbols, N=30): return Figure -def KMPlots(Raw, Symbols, Survival, Censored, N=30): +def KMPlots(Gradients, Raw, Symbols, Survival, Censored, N=10): """ Generates KM plots for individual features ranked by absolute magnitude. Parameters: ---------- - Gradients: numpy matrix - a matrix containing feature weights. + Gradients : array_like + Numpy array containing feature/sample gradients obtained by RiskCohort. + Features are in columns and samples are in rows. - Symbols: numpy nd array - a matrix of feature Symbols. + Raw : array_like + Numpy array containing raw, unnormalized feature values. These are used to + examine associations between feature values and cluster assignments. + Features are in columns and samples are in rows. + + Symbols : array_like + List containing strings describing features. See Notes below for + restrictions on symbol names. - N: integer value - number of featurs to display in barchart. + Survival : array_like + Array containing death or last followup values. + + Censored : array_like + Array containing vital status at last followup. 1 (alive) or 0 (deceased). + + N : scalar + Number of features to include in analysis. Features are scored by absolute + mean gradient and the highest N magnitude features will be used + to generate the plot. 
Returns ------- @@ -284,6 +325,8 @@ def KMPlots(Raw, Symbols, Survival, Censored, N=30): Notes ----- + Suffixes like '_Mut' and '_CNV' that are generated by the package + tcgaintegrator to identify feature types are required for analysis. Note this uses feature values as opposed to back-propagated risk gradients. """ @@ -292,7 +335,7 @@ def KMPlots(Raw, Symbols, Survival, Censored, N=30): Names = [] # generate mean values - Means = np.asarray(np.mean(Raw, axis=0)) + Means = np.asarray(np.mean(Gradients, axis=0)) # sort features by mean absolute gradient Order = np.argsort(-np.abs(Means)) diff --git a/survivalnet/analysis/__init__.py b/survivalnet/analysis/__init__.py index a243f21..051fca3 100644 --- a/survivalnet/analysis/__init__.py +++ b/survivalnet/analysis/__init__.py @@ -2,15 +2,15 @@ # must be imported after RiskCohort from .Visualize import PairScatter -from .Visualize import RankedBar from .Visualize import RankedBox +from .RiskCluster import RiskCluster from .Visualize import Visualize # list functions and classes available for public use __all__ = ( 'PairScatter', - 'RankedBar', 'RankedBox', + 'RiskCluster', 'RiskCohort', 'Visualize', ) diff --git a/survivalnet/optimization/__init__.py b/survivalnet/optimization/__init__.py index 698942b..ff05471 100644 --- a/survivalnet/optimization/__init__.py +++ b/survivalnet/optimization/__init__.py @@ -11,7 +11,6 @@ 'BFGS', 'isOverfitting', 'GLDS', - 'LineSearch', 'Optimization', 'SurvivalAnalysis', )